def __init__(
        self,
        manager_optimizer=None,
        optimizer=None,
        snn_optimizer=None,
        optimizer_args=None,
        step_size=1e-6,
        latents=None,  # some sort of iterable of the actual latent vectors
        period=10,  # how often a latent is chosen
        truncate_local_is_ratio=None,
        **kwargs):
    """Configure the concurrent policy-gradient algorithm.

    :param manager_optimizer: stored unchanged on ``self.manager_optimizer``.
    :param optimizer: optimizer instance to use. When None, a
        ``FirstOrderOptimizer`` is built with ``learning_rate=step_size`` and
        ``optimizer_args``.
    :param snn_optimizer: stored unchanged on ``self.snn_optimizer``.
    :param optimizer_args: extra keyword arguments for the default optimizer;
        defaults to ``dict(batch_size=None)``. Ignored when ``optimizer`` is
        given.
    :param step_size: learning rate of the default optimizer.
    :param latents: stored unchanged on ``self.latents``.
    :param period: passed to ``HierBatchSampler`` and stored on ``self.period``.
    :param truncate_local_is_ratio: stored unchanged.
    :param kwargs: forwarded to the parent constructor.
    """
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict(batch_size=None)
        optimizer = FirstOrderOptimizer(
            learning_rate=step_size, **optimizer_args)
    # Assign unconditionally: the previous code only set self.optimizer in the
    # default branch, so an optimizer passed by the caller was silently lost.
    self.optimizer = optimizer
    self.manager_optimizer = manager_optimizer
    self.snn_optimizer = snn_optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    super(PG_concurrent, self).__init__(**kwargs)

    self.latents = latents
    self.period = period
    # TODO: fix this sampler stuff
    self.sampler = HierBatchSampler(self, self.period)
    # self.policy is presumably set by the parent constructor -- TODO confirm.
    self.diagonal = DiagonalGaussian(
        self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []
def __init__(self,
             optimizer=None,
             optimizer_args=None,
             step_size=0.003,
             num_latents=6,
             latents=None,  # some sort of iterable of the actual latent vectors
             period=10,  # how often a latent is chosen
             truncate_local_is_ratio=None,
             epsilon=0.1,
             train_pi_iters=10,
             use_skill_dependent_baseline=False,
             mlp_skill_dependent_baseline=False,
             freeze_manager=False,
             freeze_skills=False,
             **kwargs):
    """Configure the concurrent PPO algorithm for a hierarchical policy.

    :param optimizer: optimizer instance to use. When None, a
        ``FirstOrderOptimizer`` is built with ``learning_rate=step_size`` and
        ``max_epochs=train_pi_iters``.
    :param optimizer_args: extra keyword arguments for the default optimizer;
        defaults to ``dict(batch_size=None)``. Ignored when ``optimizer`` is
        given.
    :param step_size: learning rate of the default optimizer.
    :param num_latents: not read; ``self.num_latents`` is always taken from
        ``kwargs['policy'].latent_dim``.
    :param latents: stored unchanged on ``self.latents``.
    :param period: passed to ``HierBatchSampler``; ``self.period`` is then
        overwritten with ``self.policy.period``.
    :param truncate_local_is_ratio: stored unchanged.
    :param epsilon: stored unchanged on ``self.epsilon``.
    :param train_pi_iters: ``max_epochs`` of the default optimizer.
    :param use_skill_dependent_baseline: when true, build
        ``self.skill_dependent_baseline`` over an enlarged observation space.
    :param mlp_skill_dependent_baseline: pick ``GaussianMLPBaseline`` instead
        of ``LinearFeatureBaseline`` for that baseline.
    :param freeze_manager: stored unchanged; must not be set together with
        ``freeze_skills``.
    :param freeze_skills: stored unchanged.
    :param kwargs: forwarded to the parent constructor; must contain
        ``'policy'`` (and ``'env'`` when the skill-dependent baseline is used).
    """
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict(batch_size=None)
        optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                        max_epochs=train_pi_iters,
                                        **optimizer_args)
    # Assign unconditionally: the previous code only set self.optimizer in the
    # default branch, so an optimizer passed by the caller was silently lost.
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    self.epsilon = epsilon

    super(Concurrent_PPO, self).__init__(**kwargs)
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period
    self.freeze_manager = freeze_manager
    self.freeze_skills = freeze_skills
    # Freezing both levels would leave nothing to train.
    assert (not freeze_manager) or (not freeze_skills)

    # TODO: fix this sampler stuff
    # NOTE(review): the sampler receives the constructor's `period`, while
    # self.period is replaced by the policy's period just below -- confirm the
    # two are always meant to agree.
    self.sampler = HierBatchSampler(self, self.period)
    self.diagonal = DiagonalGaussian(
        self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
        # Holds trivially after the assignment above; kept as in the original.
        assert self.policy.period == self.period

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
    if use_skill_dependent_baseline:
        curr_env = kwargs['env']
        skill_dependent_action_space = curr_env.action_space
        # +1 for the t_remaining entry appended to the observation
        new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1
        skill_dependent_obs_space_dim = (
            new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,
        )
        skill_dependent_obs_space = Box(
            -1.0, 1.0, shape=skill_dependent_obs_space_dim)
        skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                           skill_dependent_action_space)
        if self.mlp_skill_dependent_baseline:
            self.skill_dependent_baseline = GaussianMLPBaseline(
                env_spec=skill_dependent_env_spec)
        else:
            self.skill_dependent_baseline = LinearFeatureBaseline(
                env_spec=skill_dependent_env_spec)
def __init__(
        self,
        optimizer=None,
        optimizer_args=None,
        step_size=1e-2,
        num_latents=6,
        latents=None,  # some sort of iterable of the actual latent vectors
        period=10,  # how often a latent is chosen
        truncate_local_is_ratio=None,
        use_skill_dependent_baseline=False,
        **kwargs):
    """Configure the approximate concurrent policy-gradient algorithm.

    :param optimizer: optimizer instance; when None a ``FirstOrderOptimizer``
        is created from ``step_size`` and ``optimizer_args``.
    :param optimizer_args: overrides merged on top of the defaults
        ``batch_size=None, max_epochs=1`` for the default optimizer.
    :param step_size: learning rate of the default optimizer.
    :param num_latents: not read; ``self.num_latents`` comes from
        ``kwargs['policy'].latent_dim``.
    :param latents: stored unchanged on ``self.latents``.
    :param period: passed to ``HierBatchSampler``; ``self.period`` is then
        overwritten with ``self.policy.period``.
    :param truncate_local_is_ratio: stored unchanged.
    :param use_skill_dependent_baseline: when true, build a
        ``LinearFeatureBaseline`` over an enlarged observation space.
    :param kwargs: forwarded to the parent constructor; must contain
        ``'policy'`` (and ``'env'`` when the baseline is requested).
    """
    # Must run before any other local is bound: it snapshots the arguments.
    Serializable.quick_init(self, locals())

    if optimizer is None:
        merged_args = dict(batch_size=None, max_epochs=1)
        if optimizer_args is not None:
            merged_args = dict(merged_args, **optimizer_args)
        optimizer = FirstOrderOptimizer(learning_rate=step_size, **merged_args)
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio

    super(PG_concurrent_approx, self).__init__(**kwargs)
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.period = period

    # TODO: fix this sampler stuff
    self.sampler = HierBatchSampler(self, self.period)
    low_level_action_dim = self.policy.low_policy.action_space.flat_dim
    self.diagonal = DiagonalGaussian(low_level_action_dim)
    self.debug_fns = []

    assert isinstance(self.policy, HierarchicalPolicy)
    if self.policy is not None:
        self.period = self.policy.period
        # Trivially true after the assignment above.
        assert self.policy.period == self.period
    self.trainable_manager = self.policy.trainable_manager

    # skill dependent baseline
    self.use_skill_dependent_baseline = use_skill_dependent_baseline
    if use_skill_dependent_baseline:
        env = kwargs['env']
        baseline_obs_dim = (
            (env.observation_space.shape[0] + 1) * self.num_latents,
        )
        baseline_obs_space = Box(-1.0, 1.0, shape=baseline_obs_dim)
        baseline_env_spec = EnvSpec(baseline_obs_space, env.action_space)
        self.skill_dependent_baseline = LinearFeatureBaseline(
            env_spec=baseline_env_spec)
def __init__(
        self,
        optimizer=None,
        optimizer_args=None,
        step_size=0.0003,
        latents=None,  # some sort of iterable of the actual latent vectors
        average_period=10,  # average over all the periods
        truncate_local_is_ratio=None,
        epsilon=0.1,
        train_pi_iters=80,
        use_skill_dependent_baseline=False,
        **kwargs):
    """Configure the HiPPO algorithm for a hierarchical policy.

    :param optimizer: optimizer instance to use. When None, a
        ``FirstOrderOptimizer`` is built with ``learning_rate=step_size`` and
        ``max_epochs=train_pi_iters``.
    :param optimizer_args: extra keyword arguments for the default optimizer;
        defaults to ``dict(batch_size=None)``. Ignored when ``optimizer`` is
        given.
    :param step_size: learning rate of the default optimizer.
    :param latents: stored unchanged on ``self.latents``.
    :param average_period: stored unchanged on ``self.average_period``.
    :param truncate_local_is_ratio: stored unchanged.
    :param epsilon: stored unchanged on ``self.epsilon``.
    :param train_pi_iters: ``max_epochs`` of the default optimizer.
    :param use_skill_dependent_baseline: stored unchanged.
    :param kwargs: forwarded to the parent constructor; must contain
        ``'policy'``.
    """
    if optimizer is None:
        if optimizer_args is None:
            optimizer_args = dict(batch_size=None)
        optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                        max_epochs=train_pi_iters,
                                        **optimizer_args)
    # Assign unconditionally: the previous code only set self.optimizer in the
    # default branch, so an optimizer passed by the caller was silently lost.
    self.optimizer = optimizer
    self.step_size = step_size
    self.truncate_local_is_ratio = truncate_local_is_ratio
    self.epsilon = epsilon

    super(Hippo, self).__init__(**kwargs)
    self.num_latents = kwargs['policy'].latent_dim
    self.latents = latents
    self.average_period = average_period

    self.sampler = BatchSampler(self)
    self.diagonal = DiagonalGaussian(
        self.policy.low_policy.action_space.flat_dim)
    self.debug_fns = []
    self.use_skill_dependent_baseline = use_skill_dependent_baseline

    assert isinstance(self.policy, HierarchicalPolicy)
    # Independent snapshot of the policy taken at construction time.
    self.old_policy = copy.deepcopy(self.policy)
def __init__(
        self,
        input_shape,
        output_dim,
        mean_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
        name=None,
):
    """Build the mean / log-std networks and register the fitting objective.

    :param input_shape: Shape of the input data (a tuple).
    :param output_dim: Dimension of output.
    :param mean_network: Optional pre-built network for the mean; when None
        an MLP is created from ``hidden_sizes`` / ``hidden_nonlinearity``.
    :param hidden_sizes: Hidden layer sizes of the default mean network.
    :param hidden_nonlinearity: Non-linearity of the default mean network.
    :param optimizer: Optimizer minimizing the negative log-likelihood.
        Defaults to ``PenaltyLbfgsOptimizer`` with a trust region and
        ``LbfgsOptimizer`` without one.
    :param use_trust_region: Whether to add the mean-KL constraint.
    :param step_size: Bound used in the KL constraint.
    :param learn_std: Whether the constant log-std parameter is trainable.
        Only used when ``adaptive_std`` is False.
    :param init_std: Initial value of the constant std.
    :param adaptive_std: Whether the std is produced by its own MLP.
    :param std_share_network: Not read by this constructor.
    :param std_hidden_sizes: Hidden layer sizes of the std MLP.
    :param std_nonlinearity: Hidden non-linearity of the std MLP.
    :param normalize_inputs: Stored on ``self._normalize_inputs``.
    :param normalize_outputs: Stored on ``self._normalize_outputs``.
    :param name: Stored on ``self._name``.
    """
    # Must run before any other local is bound: it snapshots the arguments.
    Serializable.quick_init(self, locals())

    if optimizer is None:
        optimizer = (PenaltyLbfgsOptimizer()
                     if use_trust_region else LbfgsOptimizer())
    self._optimizer = optimizer

    if mean_network is None:
        mean_network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=None,
        )
    mean_layer = mean_network.output_layer

    if adaptive_std:
        std_mlp = MLP(
            input_shape=input_shape,
            input_var=mean_network.input_layer.input_var,
            output_dim=output_dim,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_nonlinearity,
            output_nonlinearity=None,
        )
        log_std_layer = std_mlp.output_layer
    else:
        log_std_layer = ParamLayer(
            mean_network.input_layer,
            num_units=output_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

    LasagnePowered.__init__(self, [mean_layer, log_std_layer])

    # Symbolic inputs.
    xs = mean_network.input_layer.input_var
    ys = TT.matrix("ys")
    prev_means = TT.matrix("old_means")
    prev_log_stds = TT.matrix("old_log_stds")

    # Normalization statistics, broadcast along the leading axis.
    x_broadcast = (True, ) + (False, ) * len(input_shape)
    x_mean = theano.shared(np.zeros((1, ) + input_shape),
                           name="x_mean",
                           broadcastable=x_broadcast)
    x_std = theano.shared(np.ones((1, ) + input_shape),
                          name="x_std",
                          broadcastable=x_broadcast)
    y_mean = theano.shared(np.zeros((1, output_dim)),
                           name="y_mean",
                           broadcastable=(True, False))
    y_std = theano.shared(np.ones((1, output_dim)),
                          name="y_std",
                          broadcastable=(True, False))

    xs_normalized = (xs - x_mean) / x_std
    ys_normalized = (ys - y_mean) / y_std

    # Network outputs live in normalized space ...
    means_normalized = L.get_output(
        mean_layer, {mean_network.input_layer: xs_normalized})
    log_stds_normalized = L.get_output(
        log_std_layer, {mean_network.input_layer: xs_normalized})

    # ... and are mapped back to the original output scale for prediction.
    means = means_normalized * y_std + y_mean
    log_stds = log_stds_normalized + TT.log(y_std)

    prev_means_normalized = (prev_means - y_mean) / y_std
    prev_log_stds_normalized = prev_log_stds - TT.log(y_std)

    dist = DiagonalGaussian()
    self._dist = dist

    current_info = dict(mean=means_normalized, log_std=log_stds_normalized)
    previous_info = dict(mean=prev_means_normalized,
                         log_std=prev_log_stds_normalized)
    mean_kl = TT.mean(dist.kl_sym(previous_info, current_info))
    loss = -TT.mean(dist.log_likelihood_sym(ys_normalized, current_info))

    self._f_predict = compile_function([xs], means)
    self._f_pdists = compile_function([xs], [means, log_stds])
    self._l_mean = mean_layer
    self._l_log_std = log_std_layer

    update_kwargs = dict(
        loss=loss,
        target=self,
        network_outputs=[means_normalized, log_stds_normalized],
    )
    if use_trust_region:
        update_kwargs["leq_constraint"] = (mean_kl, step_size)
        update_kwargs["inputs"] = [xs, ys, prev_means, prev_log_stds]
    else:
        update_kwargs["inputs"] = [xs, ys]
    self._optimizer.update_opt(**update_kwargs)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs
    self._x_mean_var = x_mean
    self._x_std_var = x_std
    self._y_mean_var = y_mean
    self._y_std_var = y_std
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
):
    """Build a Gaussian MLP policy over a ``Box`` action space.

    :param env_spec: environment spec; its action space must be a ``Box``.
    :param hidden_sizes: sizes of the hidden layers of the mean network
    :param learn_std: whether the constant log-std parameter is trainable
        (only used when ``adaptive_std`` is False)
    :param init_std: initial std of the constant log-std parameter
    :param adaptive_std: whether the log-std is produced by its own MLP
    :param std_share_network: not read by this constructor
    :param std_hidden_sizes: sizes of the hidden layers of the std network
    :param min_std: lower bound applied to the std (through its log); pass
        None to disable
    :param std_hidden_nonlinearity: hidden nonlinearity of the std network
    :param hidden_nonlinearity: hidden nonlinearity of the mean network
    :param output_nonlinearity: output nonlinearity of the mean network
    :return:
    """
    # Must run before any other local is bound: it snapshots the arguments.
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    n_obs = env_spec.observation_space.flat_dim
    n_act = env_spec.action_space.flat_dim

    # Mean network.
    mean_network = MLP(
        input_shape=(n_obs, ),
        output_dim=n_act,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
    )
    self._mean_network = mean_network
    mean_layer = mean_network.output_layer
    observations = mean_network.input_var

    # Log-std: either a second MLP on the same input layer or one parameter.
    if adaptive_std:
        log_std_layer = MLP(
            input_shape=(n_obs, ),
            input_layer=mean_network.input_layer,
            output_dim=n_act,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_hidden_nonlinearity,
            output_nonlinearity=None,
        ).output_layer
    else:
        log_std_layer = ParamLayer(
            mean_network.input_layer,
            num_units=n_act,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

    self.min_std = min_std

    means, log_stds = L.get_output([mean_layer, log_std_layer])
    if self.min_std is not None:
        # Clamp in log space so the std never falls below min_std.
        log_stds = TT.maximum(log_stds, np.log(min_std))

    self._mean_var = means
    self._log_std_var = log_stds
    self._l_mean = mean_layer
    self._l_log_std = log_std_layer
    self._dist = DiagonalGaussian(n_act)

    LasagnePowered.__init__(self, [mean_layer, log_std_layer])
    super(GaussianMLPPolicy, self).__init__(env_spec)

    self._f_dist = ext.compile_function(
        inputs=[observations],
        outputs=[means, log_stds],
    )
def __init__(
        self,
        env_spec,
        mean_hidden_nonlinearity=tf.nn.relu,
        mean_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=tf.nn.relu,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
):
    """Build the TensorFlow graph of a Gaussian MLP policy.

    :param env_spec: environment spec; its action space must be a ``Box``.
    :param mean_hidden_nonlinearity: nonlinearity used for the mean hidden
        layers
    :param mean_hidden_sizes: list of hidden_sizes for the fully-connected
        hidden layers of the mean network
    :param std_hidden_nonlinearity: nonlinearity used for the std hidden
        layers
    :param std_hidden_sizes: list of hidden_sizes for the fully-connected
        hidden layers of the std network
    :param min_std: lower bound applied to the std, to avoid numerical issues
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)
    super(GaussianMLPPolicy, self).__init__(env_spec)
    self.env_spec = env_spec

    # The session is capped at 20% of the GPU memory of the process.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
    self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    # Create network
    observation_dim = self.env_spec.observation_space.flat_dim
    self.observations_input = tf.placeholder(tf.float32,
                                             shape=[None, observation_dim])
    action_dim = self.env_spec.action_space.flat_dim

    with tf.variable_scope('mean'):
        mlp_mean_output = tf_util.mlp(self.observations_input,
                                      observation_dim,
                                      mean_hidden_sizes,
                                      mean_hidden_nonlinearity)
        mlp_mean_output_size = mean_hidden_sizes[-1]
        self.mean = tf_util.linear(mlp_mean_output,
                                   mlp_mean_output_size,
                                   action_dim)

    with tf.variable_scope('log_std'):
        mlp_std_output = tf_util.mlp(self.observations_input,
                                     observation_dim,
                                     std_hidden_sizes,
                                     std_hidden_nonlinearity)
        mlp_std_output_size = std_hidden_sizes[-1]
        self.log_std = tf_util.linear(mlp_std_output,
                                      mlp_std_output_size,
                                      action_dim)

    self.std = tf.maximum(tf.exp(self.log_std), min_std)

    self._dist = DiagonalGaussian(action_dim)

    self.actions_output = tf.placeholder(tf.float32,
                                         shape=[None, action_dim])
    z = (self.actions_output - self.mean) / self.std
    # Element-wise Gaussian log-density:
    #   log N(a; mu, sigma) = -log(sigma) - z**2 / 2 - log(2*pi) / 2.
    # The previous expression used -log(sigma**2), i.e. -2*log(sigma), which
    # double-weights the log-std term and is not a valid log-density.
    self.log_likelihood = (-tf.log(self.std)
                           - z**2 * 0.5
                           - tf.log(2 * np.pi) * 0.5)
def __init__(
        self,
        env_spec,
        env,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
        pkl_path=None,
):
    """Build an SNN Gaussian MLP policy, optionally warm-started from a pkl.

    :param env_spec: environment spec; its action space must be a ``Box``.
    :param env: environment used to split the observation into robot and
        maze parts.
    :param latent_dim: dimension of the latent variables
    :param latent_name: distribution of the latent variables: 'normal',
        'bernoulli' or 'categorical'
    :param bilinear_integration: Boolean indicator of bilinear integration or
        simple concatenation
    :param resample: Boolean indicator of resampling at every step or only at
        the start of the rollout (or whenever agent is reset, which can happen
        several times along the rollout with rollout in utils_snn)
    :param hidden_sizes: sizes of the hidden layers of the mean network
    :param learn_std: whether the constant log-std parameter is trainable
        (only used when ``adaptive_std`` is False)
    :param init_std: initial std of the constant log-std parameter
    :param adaptive_std: whether the log-std is produced by its own MLP
    :param std_share_network: not read by this constructor
    :param std_hidden_sizes: sizes of the hidden layers of the std network
    :param std_hidden_nonlinearity: hidden nonlinearity of the std network
    :param hidden_nonlinearity: hidden nonlinearity of the mean network
    :param output_nonlinearity: output nonlinearity of the mean network
    :param min_std: lower bound applied to the std (through its log)
    :param pkl_path: path, relative to ``config.PROJECT_PATH``, of a pickled
        snapshot whose ``"policy"`` entry overrides the latent settings and
        provides the warm-start parameters.
    :raises NotImplementedError: for an unknown latent distribution name.
    """
    self.latent_dim = latent_dim  # could I avoid needing this self for the get_action?
    self.latent_name = latent_name
    self.bilinear_integration = bilinear_integration
    self.resample = resample
    self.min_std = min_std
    self.hidden_sizes = hidden_sizes

    # if this is not empty when using reset() it will use this latent
    self.pre_fix_latent = np.array([])
    # this will hold the latents variable sampled in reset()
    self.latent_fix = np.array([])
    self._set_std_to_0 = False

    self.pkl_path = pkl_path

    if self.pkl_path:
        # NOTE: joblib.load unpickles arbitrary objects; only point pkl_path
        # at trusted snapshots.
        data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
        self.old_policy = data["policy"]
        # The restored policy dictates the latent configuration.
        self.latent_dim = self.old_policy.latent_dim
        self.latent_name = self.old_policy.latent_name
        self.bilinear_integration = self.old_policy.bilinear_integration
        self.resample = self.old_policy.resample  # this could not be needed...
        self.min_std = self.old_policy.min_std
        # NOTE(review): the networks below are still built from the
        # `hidden_sizes` argument, not from this restored value -- confirm
        # that callers always pass matching sizes when warm-starting.
        self.hidden_sizes_snn = self.old_policy.hidden_sizes

    # Use self.latent_name (possibly overridden by the pkl just above) rather
    # than the raw argument, so the distribution matches the restored policy.
    if self.latent_name == 'normal':
        self.latent_dist = DiagonalGaussian(self.latent_dim)
        self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                     log_std=np.zeros(self.latent_dim))
    elif self.latent_name == 'bernoulli':
        self.latent_dist = Bernoulli(self.latent_dim)
        self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
    elif self.latent_name == 'categorical':
        self.latent_dist = Categorical(self.latent_dim)
        if self.latent_dim > 0:
            self.latent_dist_info = dict(
                prob=1. / self.latent_dim * np.ones(self.latent_dim))
        else:
            # empty array when there are no latents
            self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
    else:
        raise NotImplementedError

    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    # retrieve dimensions from env!
    if isinstance(env, (MazeEnv, GatherEnv)):
        self.obs_robot_dim = env.robot_observation_space.flat_dim
        self.obs_maze_dim = env.maze_observation_space.flat_dim
    elif isinstance(env, NormalizedEnv):
        if isinstance(env.wrapped_env, (MazeEnv, GatherEnv)):
            self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
        else:
            self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
            self.obs_maze_dim = 0
    else:
        self.obs_robot_dim = env.observation_space.flat_dim
        self.obs_maze_dim = 0
    all_obs_dim = env_spec.observation_space.flat_dim
    assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

    # Input size of the networks: robot observation plus latent, and the
    # extra obs_robot_dim * latent_dim terms when integrating bilinearly.
    if self.bilinear_integration:
        obs_dim = (self.obs_robot_dim + self.latent_dim
                   + self.obs_robot_dim * self.latent_dim)
    else:
        obs_dim = self.obs_robot_dim + self.latent_dim  # here only if concat.

    action_dim = env_spec.action_space.flat_dim

    mean_network = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
        name="meanMLP",
    )
    self._layers_mean = mean_network.layers
    l_mean = mean_network.output_layer
    obs_var = mean_network.input_layer.input_var

    if adaptive_std:
        log_std_network = MLP(
            input_shape=(obs_dim, ),
            input_var=obs_var,
            output_dim=action_dim,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_hidden_nonlinearity,
            output_nonlinearity=None,
            name="log_stdMLP",
        )
        l_log_std = log_std_network.output_layer
        self._layers_log_std = log_std_network.layers
    else:
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )
        self._layers_log_std = [l_log_std]

    # this is a list with the "snn" layers
    self._layers_snn = self._layers_mean + self._layers_log_std

    if self.pkl_path:  # restore from pkl file
        # Reuse the policy unpickled at the top of this constructor instead of
        # loading the same file a second time.
        warm_params = self.old_policy.get_params_internal()
        self.set_params_snn(warm_params)

    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        # Clamp in log space so the std never falls below min_std.
        log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = DiagonalGaussian(action_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy_snn_restorable, self).__init__(env_spec)

    self._f_dist = ext.compile_function(
        inputs=[obs_var],
        outputs=[mean_var, log_std_var],
    )
def __init__(
        self,
        env_spec,
        env,
        pkl_path=None,
        json_path=None,
        npz_path=None,
        trainable_snn=True,
        # CF - latent units at the input
        latent_dim=3,  # we keep all these as the dim of the output of the other MLP and others that we will need!
        latent_name='categorical',
        bilinear_integration=False,  # again, needs to match!
        resample=False,  # this can change: frequency of resampling the latent?
        hidden_sizes_snn=(32, 32),
        hidden_sizes_selector=(10, 10),
        external_latent=False,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
):
    """Build a hierarchical SNN policy: a latent selector feeding an SNN.

    :param env_spec: environment spec; its action space must be a ``Box``.
    :param env: environment used to split the observation into robot and
        maze parts.
    :param pkl_path: path (relative to ``config.PROJECT_PATH``) of a pickled
        snapshot whose ``"policy"`` entry provides latent settings and
        warm-start parameters. Only used for the settings when ``json_path``
        is not given.
    :param json_path: path (relative to ``config.PROJECT_PATH``) of a json
        file whose ``['json_args']['policy']`` entry provides the latent
        settings. Takes precedence over ``pkl_path``.
    :param npz_path: path of an ``.npz`` archive with warm-start parameters;
        only used together with ``json_path``.
    :param trainable_snn: when False, the "trainable" tag is removed from
        every parameter of the mean and log-std layers.
    :param latent_dim: dimension of the latent variables
    :param latent_name: distribution of the latent variables: 'normal',
        'bernoulli' or 'categorical'
    :param bilinear_integration: integrate observation and latent with a
        ``BilinearIntegrationLayer`` instead of concatenating them
    :param resample: stored on ``self.resample``
    :param hidden_sizes_snn: hidden layer sizes of the mean network
    :param hidden_sizes_selector: hidden layer sizes of the selector network
    :param external_latent: when True the latent comes from
        ``self.shared_latent_var`` instead of the selector network
    :param learn_std: whether the constant log-std parameter is trainable
        (only used when ``adaptive_std`` is False)
    :param init_std: initial std of the constant log-std parameter
    :param adaptive_std: whether the log-std is produced by its own MLP
    :param std_share_network: not read by this constructor
    :param std_hidden_sizes: sizes of the hidden layers of the std network
    :param std_hidden_nonlinearity: hidden nonlinearity of the std network
    :param hidden_nonlinearity: hidden nonlinearity of the selector and mean
        networks
    :param output_nonlinearity: output nonlinearity of the mean network
    :param min_std: lower bound applied to the std (through its log)
    :raises NotImplementedError: for an unknown latent distribution name.
    """
    self.latent_dim = latent_dim  # could I avoid needing this self for the get_action?
    self.latent_name = latent_name
    self.bilinear_integration = bilinear_integration
    self.resample = resample
    self.min_std = min_std
    self.hidden_sizes_snn = hidden_sizes_snn
    self.hidden_sizes_selector = hidden_sizes_selector

    # if this is not empty when using reset() it will use this latent
    self.pre_fix_latent = np.array([])
    # this will hold the latents variable sampled in reset()
    self.latent_fix = np.array([])
    # this is for external lat! update that
    self.shared_latent_var = theano.shared(self.latent_fix)
    self._set_std_to_0 = False

    self.trainable_snn = trainable_snn
    self.external_latent = external_latent
    self.pkl_path = pkl_path
    self.json_path = json_path
    self.npz_path = npz_path
    self.old_policy = None

    # There is a second restore step after all the networks are defined, to
    # warm-start the params of the SNN.
    if self.json_path:
        # Context manager so the file handle is closed deterministically
        # (the previous json.load(open(...)) leaked it).
        with open(os.path.join(config.PROJECT_PATH, self.json_path),
                  'r') as json_file:
            data = json.load(json_file)
        self.old_policy_json = data['json_args']["policy"]
        self.latent_dim = self.old_policy_json['latent_dim']
        self.latent_name = self.old_policy_json['latent_name']
        self.bilinear_integration = self.old_policy_json['bilinear_integration']
        self.resample = self.old_policy_json['resample']  # this could not be needed...
        self.min_std = self.old_policy_json['min_std']
        self.hidden_sizes_snn = self.old_policy_json['hidden_sizes']
    elif self.pkl_path:
        # NOTE: joblib.load unpickles arbitrary objects; only point pkl_path
        # at trusted snapshots.
        data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
        self.old_policy = data["policy"]
        self.latent_dim = self.old_policy.latent_dim
        self.latent_name = self.old_policy.latent_name
        self.bilinear_integration = self.old_policy.bilinear_integration
        self.resample = self.old_policy.resample  # this could not be needed...
        self.min_std = self.old_policy.min_std
        self.hidden_sizes_snn = self.old_policy.hidden_sizes

    if self.latent_name == 'normal':
        self.latent_dist = DiagonalGaussian(self.latent_dim)
        self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                     log_std=np.zeros(self.latent_dim))
    elif self.latent_name == 'bernoulli':
        self.latent_dist = Bernoulli(self.latent_dim)
        self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
    elif self.latent_name == 'categorical':
        self.latent_dist = Categorical(self.latent_dim)
        if self.latent_dim > 0:
            self.latent_dist_info = dict(
                prob=1. / self.latent_dim * np.ones(self.latent_dim))
        else:
            # this is an empty array
            self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
    else:
        raise NotImplementedError

    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    # retrieve dimensions and check consistency
    if isinstance(env, (MazeEnv, GatherEnv)):
        self.obs_robot_dim = env.robot_observation_space.flat_dim
        self.obs_maze_dim = env.maze_observation_space.flat_dim
    elif isinstance(env, NormalizedEnv):
        if isinstance(env.wrapped_env, (MazeEnv, GatherEnv)):
            self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
        else:
            self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
            self.obs_maze_dim = 0
    else:
        self.obs_robot_dim = env.observation_space.flat_dim
        self.obs_maze_dim = 0
    all_obs_dim = env_spec.observation_space.flat_dim
    assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

    if self.external_latent:  # in case we want to fix the latent externally
        l_all_obs_var = L.InputLayer(
            shape=(None,) + (self.obs_robot_dim + self.obs_maze_dim,))
        all_obs_var = l_all_obs_var.input_var
        # Rui: change False to True? this is a simple layer that directly
        # outputs self.shared_latent_var
        l_selection = ParamLayer(incoming=l_all_obs_var,
                                 num_units=self.latent_dim,
                                 param=self.shared_latent_var,
                                 trainable=False)
        selection_var = L.get_output(l_selection)
    else:
        # create network with softmax output: it will be the latent 'selector'!
        latent_selection_network = MLP(
            input_shape=(self.obs_robot_dim + self.obs_maze_dim,),
            output_dim=self.latent_dim,
            hidden_sizes=self.hidden_sizes_selector,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )
        l_all_obs_var = latent_selection_network.input_layer
        all_obs_var = latent_selection_network.input_layer.input_var

        # collect the output to select the behavior of the robot controller
        # (equivalent to latents)
        l_selection = latent_selection_network.output_layer
        selection_var = L.get_output(l_selection)

    # split all_obs into the robot and the maze obs --> ROBOT goes first!!
    l_obs_robot = CropLayer(l_all_obs_var,
                            start_index=None,
                            end_index=self.obs_robot_dim)
    # Not consumed below; construction kept as in the original.
    l_obs_maze = CropLayer(l_all_obs_var,
                           start_index=self.obs_robot_dim,
                           end_index=None)

    # Enlarge obs with the selectors (or latents).
    if self.bilinear_integration:
        l_obs_snn = BilinearIntegrationLayer([l_obs_robot, l_selection])
    else:
        l_obs_snn = L.ConcatLayer([l_obs_robot, l_selection])

    action_dim = env_spec.action_space.flat_dim

    # create the action network
    mean_network = MLP(
        input_layer=l_obs_snn,  # the layer that handles the integration of the selector
        output_dim=action_dim,
        hidden_sizes=self.hidden_sizes_snn,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
        name="meanMLP",
    )
    self._layers_mean = mean_network.layers
    l_mean = mean_network.output_layer

    if adaptive_std:
        log_std_network = MLP(
            input_layer=l_obs_snn,
            output_dim=action_dim,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_hidden_nonlinearity,
            output_nonlinearity=None,
            name="log_stdMLP",
        )
        l_log_std = log_std_network.output_layer
        self._layers_log_std = log_std_network.layers
    else:
        l_log_std = ParamLayer(
            incoming=mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )
        self._layers_log_std = [l_log_std]

    # this is a list with the "snn" layers
    self._layers_snn = self._layers_mean + self._layers_log_std

    if not self.trainable_snn:
        for layer in self._layers_snn:
            # layer.params maps each shared variable to its tags.
            for tags in layer.params.values():
                # discard, not remove: a parameter created with
                # trainable=False (e.g. the log-std ParamLayer when
                # learn_std is False) has no "trainable" tag, and remove()
                # would raise KeyError.
                tags.discard("trainable")

    if self.json_path and self.npz_path:
        # Materialize the arrays, then close the archive.
        with np.load(os.path.join(config.PROJECT_PATH,
                                  self.npz_path)) as npz_file:
            warm_params_dict = dict(npz_file)
        self.set_params_snn(warm_params_dict)
    elif self.pkl_path:
        # Loaded again here because self.old_policy stays None when the
        # settings came from json_path.
        data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
        warm_params = data['policy'].get_params_internal()
        self.set_params_snn(warm_params)

    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        # Clamp in log space so the std never falls below min_std.
        log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = DiagonalGaussian(action_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy_snn_hier, self).__init__(env_spec)

    # debug
    obs_snn_var = L.get_output(l_obs_snn)
    self._l_obs_snn = ext.compile_function(
        inputs=[all_obs_var],
        outputs=obs_snn_var,
    )
    self._mean = ext.compile_function(
        inputs=[all_obs_var],
        outputs=mean_var,
    )
    self._f_dist = ext.compile_function(
        inputs=[all_obs_var],
        outputs=[mean_var, log_std_var],
    )
    # if I want to monitor the selector output
    self._f_select = ext.compile_function(
        inputs=[all_obs_var],
        outputs=selection_var,
    )
def __init__(
        self,
        env_spec,
        env,
        pkl_paths=(),
        json_paths=(),
        npz_paths=(),
        trainable_old=True,
        external_selector=False,
        hidden_sizes_selector=(10, 10),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
):
    """
    Build a Gaussian policy whose mean and log-std are a selector-weighted
    combination (via SumProdLayer) of several previously trained policies.

    One mean MLP plus one log-std ParamLayer is created per old policy, all
    fed with the robot part of the observation. Their parameters are warm
    started from npz files (when both json_paths and npz_paths are given) or
    otherwise from pickled policies (pkl_paths).

    :param env_spec: spec whose observation_space/action_space give the flat dims;
        the action space must be a Box
    :param env: environment used to split the observation into its robot and maze
        parts (MazeEnv/GatherEnv, possibly wrapped in a NormalizedEnv); for any
        other env the whole observation is treated as the robot observation
    :param pkl_paths: tuple/list of pkl paths, relative to config.PROJECT_PATH
    :param json_paths: tuple/list of json paths, relative to config.PROJECT_PATH;
        the hidden sizes of every old policy are read from these
    :param npz_paths: tuple/list of npz paths, relative to config.PROJECT_PATH
    :param trainable_old: Are the old policies still trainable
    :param external_selector: if True the selector is a non-trainable ParamLayer backed
        by self.shared_selector_var; otherwise it is a softmax MLP over the full observation
    :param hidden_sizes_selector: sizes of the hidden layers of the selector MLP
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std: not used by this constructor
    :param std_share_network: not used by this constructor
    :param std_hidden_sizes: not used by this constructor
    :param std_hidden_nonlinearity: not used by this constructor
    :param hidden_nonlinearity: nonlinearity used for each hidden layer (selector and old mean MLPs)
    :param output_nonlinearity: nonlinearity for the output layer of the old mean MLPs
    :param min_std: if not None, the log-std is clipped from below at log(min_std),
        to avoid numerical issues
    """
    # define where are the old policies to use and what to do with them:
    self.trainable_old = trainable_old  # whether to keep training the old policies loaded here
    self.pkl_paths = pkl_paths
    self.json_paths = json_paths
    self.npz_paths = npz_paths
    # pkl could be zero if giving npz
    self.selector_dim = max(len(json_paths), len(pkl_paths))
    # if not use a selector NN here, just externally fixed selector variable:
    self.external_selector = external_selector
    # if this is not empty when using reset() it will use this selector
    self.pre_fix_selector = np.zeros(self.selector_dim)
    # this will hold the selectors variable sampled in reset()
    self.selector_fix = np.zeros(self.selector_dim)
    # shared variable backing the externally fixed selector
    self.shared_selector_var = theano.shared(self.selector_fix)
    # else, describe the MLP used:
    self.hidden_sizes_selector = hidden_sizes_selector
    self.min_std = min_std
    self._set_std_to_0 = False

    # not checking that all the old policies have this act_dim
    self.action_dim = env_spec.action_space.flat_dim

    # assume json always given: one hidden_sizes entry is needed per old policy
    self.old_hidden_sizes = []
    for json_path in self.json_paths:
        # context manager so the file handle is closed deterministically
        with open(os.path.join(config.PROJECT_PATH, json_path), 'r') as json_file:
            data = json.load(json_file)
        old_json_policy = data['json_args']["policy"]
        self.old_hidden_sizes.append(old_json_policy['hidden_sizes'])

    # retrieve dimensions and check consistency
    if isinstance(env, (MazeEnv, GatherEnv)):
        self.obs_robot_dim = env.robot_observation_space.flat_dim
        self.obs_maze_dim = env.maze_observation_space.flat_dim
    elif isinstance(env, NormalizedEnv):
        if isinstance(env.wrapped_env, (MazeEnv, GatherEnv)):
            self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
        else:
            self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
            self.obs_maze_dim = 0
    else:
        self.obs_robot_dim = env.observation_space.flat_dim
        self.obs_maze_dim = 0
    all_obs_dim = env_spec.observation_space.flat_dim
    assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    if self.external_selector:  # in case we want to fix the selector externally
        l_all_obs_var = L.InputLayer(
            shape=(None, ) + (self.obs_robot_dim + self.obs_maze_dim, ))
        all_obs_var = l_all_obs_var.input_var
        l_selection = ParamLayer(incoming=l_all_obs_var,
                                 num_units=self.selector_dim,
                                 param=self.shared_selector_var,
                                 trainable=False)
        selection_var = L.get_output(l_selection)
    else:
        # create network with softmax output: it will be the selector!
        selector_network = MLP(
            input_shape=(self.obs_robot_dim + self.obs_maze_dim, ),
            output_dim=self.selector_dim,
            hidden_sizes=self.hidden_sizes_selector,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )
        l_all_obs_var = selector_network.input_layer
        all_obs_var = selector_network.input_layer.input_var
        # collect the output to select the behavior of the robot controller
        l_selection = selector_network.output_layer
        selection_var = L.get_output(l_selection)

    # split all_obs into the robot and the maze obs --> ROBOT goes first!!
    l_obs_robot = CropLayer(l_all_obs_var, start_index=None,
                            end_index=self.obs_robot_dim)
    # NOTE(review): the maze crop is built but not consumed in this constructor
    l_obs_maze = CropLayer(l_all_obs_var, start_index=self.obs_robot_dim,
                           end_index=None)

    # create the action networks (kept on self so they can be reached from reset)
    self.old_l_means = []
    self.old_l_log_stds = []
    self.old_layers = []
    for i in range(self.selector_dim):
        mean_network = MLP(
            input_layer=l_obs_robot,
            output_dim=self.action_dim,
            hidden_sizes=self.old_hidden_sizes[i],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP{}".format(i),
        )
        self.old_l_means.append(mean_network.output_layer)
        self.old_layers += mean_network.layers
        l_log_std = ParamLayer(
            incoming=mean_network.input_layer,
            num_units=self.action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std{}".format(i),
            trainable=learn_std,
        )
        self.old_l_log_stds.append(l_log_std)
        self.old_layers += [l_log_std]

    if not self.trainable_old:
        for layer in self.old_layers:
            # params of layer are OrDict: key=the shared var, val=set of tags.
            # discard (not remove): a param created with trainable=False
            # (e.g. the log-std when learn_std is False) has no such tag,
            # and remove() would raise KeyError on it.
            for tags in layer.params.values():
                tags.discard("trainable")

    def _indexed_name(name, idx):
        # Map a parameter name of a stand-alone policy to the i-th copy here.
        if name == 'output_log_std.param':
            return 'output_log_std{}.param'.format(idx)
        if name.startswith('meanMLP_'):
            return 'meanMLP{}_'.format(idx) + name[8:]
        return 'meanMLP{}_'.format(idx) + name

    if self.json_paths and self.npz_paths:
        old_params_dict = {}
        for i, npz_path in enumerate(self.npz_paths):
            # materialize the arrays, then close the archive
            with np.load(os.path.join(config.PROJECT_PATH, npz_path)) as npz_file:
                params_dict = dict(npz_file)
            for key, value in params_dict.items():
                old_params_dict[_indexed_name(key, i)] = value
        self.set_old_params(old_params_dict)
    elif self.pkl_paths:
        old_params_dict = {}
        for i, pkl_path in enumerate(self.pkl_paths):
            # NOTE(review): joblib.load unpickles arbitrary objects; only use trusted files
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            for param in data['policy'].get_params_internal():
                old_params_dict[_indexed_name(param.name, i)] = param.get_value()
        self.set_old_params(old_params_dict)

    # new layers actually selecting the correct output
    l_mean = SumProdLayer(self.old_l_means + [l_selection])
    l_log_std = SumProdLayer(self.old_l_log_stds + [l_selection])
    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = DiagonalGaussian(self.action_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy_multi_hier, self).__init__(env_spec)

    # outputs of every old mean network, for inspection
    self._f_old_means = ext.compile_function(
        inputs=[all_obs_var],
        outputs=[L.get_output(l_old_mean) for l_old_mean in self.old_l_means])

    # old means followed by the selector output
    self._f_all_inputs = ext.compile_function(
        inputs=[all_obs_var],
        outputs=[L.get_output(l_old_mean) for l_old_mean in self.old_l_means]
        + [selection_var])

    self._f_dist = ext.compile_function(
        inputs=[all_obs_var],
        outputs=[mean_var, log_std_var],
    )
    # if I want to monitor the selector output
    self._f_select = ext.compile_function(
        inputs=[all_obs_var],
        outputs=selection_var,
    )
def __init__(
        self,
        env_spec,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
):
    """
    Gaussian MLP policy whose network input is the observation combined with a latent code.

    :param env_spec: spec providing the flat observation/action dims; the action space must be a Box
    :param latent_dim: dimension of the latent variables
    :param latent_name: distribution of the latent variables: 'normal', 'bernoulli' or
        'categorical'; anything else raises NotImplementedError
    :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        (only the size of the network input is derived from it here)
    :param resample: Boolean indicator of resampling at every step or only at the start of the
        rollout (or whenever agent is reset, which can happen several times along the rollout
        with rollout in utils_snn)
    :param hidden_sizes: sizes of the hidden layers of the mean network
    :param learn_std: whether the constant log-std parameter is trainable
    :param init_std: initial std of the constant log-std parameter
    :param adaptive_std: if True the log-std comes from its own MLP on the same input variable
    :param std_share_network: not used by this constructor
    :param std_hidden_sizes: sizes of the hidden layers of the log-std MLP
    :param std_hidden_nonlinearity: hidden nonlinearity of the log-std MLP
    :param hidden_nonlinearity: hidden nonlinearity of the mean network
    :param output_nonlinearity: output nonlinearity of the mean network
    :param min_std: if not None, the log-std is clipped from below at log(min_std)
    """
    # plain configuration, stored for later use by the rest of the class
    self.latent_dim = latent_dim
    self.latent_name = latent_name
    self.bilinear_integration = bilinear_integration
    self.resample = resample
    self.min_std = min_std
    self.hidden_sizes = hidden_sizes

    # a non-empty pre_fix_latent makes reset() use it as the latent
    self.pre_fix_latent = np.array([])
    # holds the latent sampled in reset()
    self.latent_fix = np.array([])
    self._set_std_to_0 = False

    # prior over the latent code
    if latent_name not in ('normal', 'bernoulli', 'categorical'):
        raise NotImplementedError
    if latent_name == 'normal':
        self.latent_dist = DiagonalGaussian(self.latent_dim)
        self.latent_dist_info = {
            'mean': np.zeros(self.latent_dim),
            'log_std': np.zeros(self.latent_dim),
        }
    elif latent_name == 'bernoulli':
        self.latent_dist = Bernoulli(self.latent_dim)
        self.latent_dist_info = {'p': 0.5 * np.ones(self.latent_dim)}
    else:  # 'categorical'
        self.latent_dist = Categorical(self.latent_dim)
        if self.latent_dim > 0:
            # uniform probabilities over the latent_dim categories
            category_probs = 1. / self.latent_dim * np.ones(self.latent_dim)
        else:
            category_probs = np.ones(self.latent_dim)
        self.latent_dist_info = {'prob': category_probs}

    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    # size of the network input: concatenation, plus the outer-product
    # terms when bilinear integration is requested
    flat_obs_dim = env_spec.observation_space.flat_dim
    obs_dim = flat_obs_dim + latent_dim
    if self.bilinear_integration:
        obs_dim += flat_obs_dim * latent_dim
    action_dim = env_spec.action_space.flat_dim

    mean_mlp = MLP(
        input_shape=(obs_dim, ),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
        name="meanMLP",
    )
    mean_layer = mean_mlp.output_layer
    net_input_layer = mean_mlp.input_layer
    obs_var = net_input_layer.input_var

    if not adaptive_std:
        # state-independent log-std parameter
        log_std_layer = ParamLayer(
            net_input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )
    else:
        # separate MLP reading the same input variable
        log_std_mlp = MLP(
            input_shape=(obs_dim, ),
            input_var=obs_var,
            output_dim=action_dim,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_hidden_nonlinearity,
            output_nonlinearity=None,
            name="log_stdMLP",
        )
        log_std_layer = log_std_mlp.output_layer

    mean_var, log_std_var = L.get_output([mean_layer, log_std_layer])
    if self.min_std is not None:
        log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

    self._l_mean = mean_layer
    self._l_log_std = log_std_layer
    self._dist = DiagonalGaussian(action_dim)

    LasagnePowered.__init__(self, [mean_layer, log_std_layer])
    super(GaussianMLPPolicy_snn, self).__init__(env_spec)

    # compiled function: network input -> [mean, log_std]
    self._f_dist = ext.compile_function(
        inputs=[obs_var],
        outputs=[mean_var, log_std_var],
    )
def __init__(
        self,
        env_spec,
        input_latent_vars=None,
        hidden_sizes=(32, 32),
        hidden_latent_vars=None,
        learn_std=True,
        init_std=1.0,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
):
    """
    Gaussian policy whose mean is produced by a StochasticMLP.

    :param env_spec: spec providing the flat observation/action dims; the action space must be a Box
    :param input_latent_vars: forwarded to StochasticMLP as input_latent_vars
    :param hidden_sizes: sizes of the hidden layers of the mean network
    :param hidden_latent_vars: forwarded to StochasticMLP as hidden_latent_vars
    :param learn_std: whether the constant log-std parameter is trainable
    :param init_std: initial std of the constant log-std parameter
    :param hidden_nonlinearity: hidden nonlinearity of the mean network
    :param output_nonlinearity: output nonlinearity of the mean network
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    action_dim = env_spec.action_space.flat_dim
    obs_dim = env_spec.observation_space.flat_dim

    # mean network with latent variables
    stochastic_mlp = StochasticMLP(
        input_shape=(obs_dim, ),
        input_latent_vars=input_latent_vars,
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_latent_vars=hidden_latent_vars,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
    )
    mean_layer = stochastic_mlp.output_layer
    net_input_layer = stochastic_mlp.input_layer
    obs_var = net_input_layer.input_var

    # state-independent log-std parameter
    log_std_layer = ParamLayer(
        net_input_layer,
        num_units=action_dim,
        param=lasagne.init.Constant(np.log(init_std)),
        name="output_log_std",
        trainable=learn_std,
    )

    self._mean_network = stochastic_mlp
    self._n_latent_layers = len(stochastic_mlp.latent_layers)
    self._l_mean = mean_layer
    self._l_log_std = log_std_layer

    LasagnePowered.__init__(self, [mean_layer, log_std_layer])
    super(StochasticGaussianMLPPolicy, self).__init__(env_spec)

    # symbolic distribution info; every key other than mean/log_std is
    # treated as a latent key
    outputs = self.dist_info_sym(stochastic_mlp.input_var)
    self._latent_keys = sorted(set(outputs.keys()) - {"mean", "log_std"})

    # second element of get_full_output is indexed per latent layer below
    layers_to_query = ([self._l_mean, self._l_log_std]
                       + self._mean_network.latent_layers)
    extras = get_full_output(layers_to_query)[1]
    self._latent_distributions = [
        extras[latent_layer]["distribution"]
        for latent_layer in self._mean_network.latent_layers
    ]

    self._dist = DiagonalGaussian(action_dim)

    self._f_dist_info = ext.compile_function(
        inputs=[obs_var],
        outputs=outputs,
    )
    self._f_dist_info_givens = None