def test_flatdim(base_space: gym.Space):
    """Wrapping a space in `Sparse` with zero sparsity keeps its flat dimension."""
    wrapped_space = Sparse(base_space, sparsity=0.)

    expected_dims = flatdim(base_space)
    actual_dims = flatdim(wrapped_space)

    assert expected_dims == actual_dims
def __init__(self, env, expert_trajs=None, discrim_arch=relu_net, discrim_arch_args=None, name='gail'):
    """Build the discriminator graph and its training op.

    :param env: environment whose observation/action spaces size the inputs.
    :param expert_trajs: demonstrations forwarded to ``set_demos``.
    :param discrim_arch: callable mapping the concatenated (obs, act) tensor
        to logits.
    :param discrim_arch_args: extra keyword arguments for ``discrim_arch``;
        ``None`` (the default) means no extra arguments.
    :param name: TensorFlow variable scope under which the graph is built.
    """
    super(GAIL, self).__init__()

    # A literal ``{}`` default would be one dict object shared by every
    # call; use a None sentinel and create a fresh dict per instance.
    if discrim_arch_args is None:
        discrim_arch_args = {}

    self.dO = flatdim(env.observation_space)
    self.dU = flatdim(env.action_space)
    self.set_demos(expert_trajs)

    # build energy model
    with tf.variable_scope(name) as vs:
        # Placeholders are [batch, dO] / [batch, dU] / [batch, 1].
        self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs')
        self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act')
        self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
        self.lr = tf.placeholder(tf.float32, (), name='lr')

        obs_act = tf.concat([self.obs_t, self.act_t], axis=1)

        logits = discrim_arch(obs_act, **discrim_arch_args)
        self.predictions = tf.nn.sigmoid(logits)

        # Binary cross-entropy between the logits and the fed labels.
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.labels))
        self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(
            self.loss)
        self._make_param_ops(vs)
def unflatten(space, x):
    """Rebuild a sample of ``space`` from its flat vector ``x``.

    Box, MultiBinary and MultiDiscrete samples are cast to the space dtype
    and reshaped to the space shape. A Discrete sample is the index of the
    first non-zero entry of ``x``. Tuple and Dict spaces are split into
    per-subspace chunks (sized by ``flatdim``) and unflattened recursively.

    Raises:
        NotImplementedError: if ``space`` is none of the handled types.
    """
    if isinstance(space, Box):
        return np.asarray(x, dtype=space.dtype).reshape(space.shape)

    if isinstance(space, Discrete):
        return int(np.nonzero(x)[0][0])

    if isinstance(space, (MultiBinary, MultiDiscrete)):
        return np.asarray(x, dtype=space.dtype).reshape(space.shape)

    if isinstance(space, Tuple):
        subspaces = space.spaces
        # Split points are the running totals of the sub-dimensions,
        # without the final total.
        boundaries = np.cumsum([flatdim(sub) for sub in subspaces])[:-1]
        chunks = np.split(x, boundaries)
        return tuple(
            unflatten(sub, chunk) for chunk, sub in zip(chunks, subspaces)
        )

    if isinstance(space, Dict):
        boundaries = np.cumsum(
            [flatdim(sub) for sub in space.spaces.values()])[:-1]
        chunks = np.split(x, boundaries)
        return {
            key: unflatten(sub, chunk)
            for chunk, (key, sub) in zip(chunks, space.spaces.items())
        }

    raise NotImplementedError
def __init__(self, observation_space, action_space):
    """Create the actor, critic and discriminator networks and their optimizers.

    The network input size (``item_dim``) is the sum of the flat dimensions
    of the 'equipped_items' and 'inventory' entries of the observation space.
    """
    self.observation_space = observation_space
    self.action_space = action_space

    equipped_dim = flatdim(self.observation_space['equipped_items'])
    inventory_dim = flatdim(self.observation_space['inventory'])
    self.item_dim = equipped_dim + inventory_dim

    # Networks (constructed in the same order as before).
    self.actor = Actor(self.action_space.nvec, self.item_dim).to(device)
    self.critic = Critic(self.item_dim).to(device)
    self.discriminator = Discriminator(
        self.action_space.nvec, self.item_dim).to(device)

    # One Adam optimizer per network, all with the same learning rate.
    self.actor_optim = torch.optim.Adam(
        self.actor.parameters(), lr=LEARNING_RATE)
    self.critic_optim = torch.optim.Adam(
        self.critic.parameters(), lr=LEARNING_RATE)
    self.discriminator_optim = torch.optim.Adam(
        self.discriminator.parameters(), lr=LEARNING_RATE)

    # Loss functions; the MSE loss is kept un-reduced (per element).
    self.mse_loss = nn.MSELoss(reduction='none')
    self.bce_loss = nn.BCELoss()

    self.last_lp = None
    self.memory = []
def test_flatdim(base_space: gym.Space):
    """The flat dimension of a `Sparse` space equals that of the wrapped space."""
    wrapped_space = Sparse(base_space, sparsity=0.)
    expected_dims = flatdim(base_space)
    actual_dims = flatdim(wrapped_space)
    assert expected_dims == actual_dims

    # The flattened dimensions shouldn't depend on the sparsity.
    wrapped_space = Sparse(base_space, sparsity=1.)
    actual_dims = flatdim(wrapped_space)
    assert expected_dims == actual_dims
def __init__(self, env, expert_trajs=None, discrim_arch=relu_net, discrim_arch_args=None, l2_reg=0, discount=1.0, max_itrs=100, name='gcl'):
    """Build the energy-based discriminator graph and its training op.

    :param env: environment whose observation/action spaces size the inputs.
    :param expert_trajs: demonstrations forwarded to ``set_demos``.
    :param discrim_arch: callable mapping the concatenated (obs, act) tensor
        to the energy.
    :param discrim_arch_args: extra keyword arguments for ``discrim_arch``;
        ``None`` (the default) means no extra arguments.
    :param l2_reg: weight of the L2 penalty on the 'reg_vars' collection of
        the discriminator scope; no penalty when not positive.
    :param discount: accepted for interface compatibility; not used in this
        constructor.
    :param max_itrs: stored on ``self.max_itrs``.
    :param name: TensorFlow variable scope under which the graph is built.
    """
    super(AIRLStateAction, self).__init__()

    # A literal ``{}`` default would be one dict object shared by every
    # call; use a None sentinel and create a fresh dict per instance.
    if discrim_arch_args is None:
        discrim_arch_args = {}

    self.dO = flatdim(env.observation_space)
    self.dU = flatdim(env.action_space)
    self.set_demos(expert_trajs)
    self.max_itrs = max_itrs

    # build energy model
    with tf.variable_scope(name) as _vs:
        # Placeholders are [batch, dO] / [batch, dU] / [batch, 1].
        self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs')
        self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act')
        self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
        self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs')
        self.lr = tf.placeholder(tf.float32, (), name='lr')

        obs_act = tf.concat([self.obs_t, self.act_t], axis=1)
        with tf.variable_scope('discrim') as dvs:
            with tf.variable_scope('energy'):
                self.energy = discrim_arch(obs_act, **discrim_arch_args)
            # we do not learn a separate log Z(s) because it is impossible to separate from the energy
            # In a discrete domain we can explicitly normalize to calculate log Z(s)
            log_p_tau = -self.energy
            discrim_vars = tf.get_collection('reg_vars', scope=dvs.name)

        log_q_tau = self.lprobs

        if l2_reg > 0:
            reg_loss = l2_reg * tf.reduce_sum(
                [tf.reduce_sum(tf.square(var)) for var in discrim_vars])
        else:
            reg_loss = 0

        # log(p + q), computed stably from the two log-densities.
        log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
        # D = p / (p + q)
        self.d_tau = tf.exp(log_p_tau - log_pq)
        cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                    (1 - self.labels) * (log_q_tau - log_pq))

        self.loss = cent_loss + reg_loss
        self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(
            self.loss)
        self._make_param_ops(_vs)
def multidiscrete_encoder(input_space: spaces.MultiDiscrete, output_space: Space, budget: int = None) -> nn.Module:
    """Create an encoder for a MultiDiscrete input: flatten, cast to float, one linear layer.

    ``budget`` is accepted but not used here. Only `Box` output spaces are
    supported (checked with an assertion).
    """
    n_in = flatdim(input_space)
    n_out = flatdim(output_space)
    assert isinstance(output_space, spaces.Box), "Only support Box output spaces for now."

    # Reshape each batch to [-1, n_in] and cast to float before the dense layer.
    flatten_to_float = LambdaModule(
        lambda v: v.reshape([-1, n_in]).to(torch.float))
    projection = nn.Linear(n_in, n_out)
    return nn.Sequential(flatten_to_float, projection)
def forward(self, observations: ContinualRLSetting.Observations, representations: Tensor) -> PolicyHeadOutput:
    """Sample actions from a categorical policy over the given representations.

    On the first call (while ``self.batch_size`` is falsy) the batch size is
    taken from the representations and the buffers are created.

    TODO: Do we actually need the observations here? They are not read in
    this method.
    """
    if len(representations.shape) < 2:
        # Un-batched input: reshape to [-1, flat input dims].
        flat_dims = flatdim(self.input_space)
        representations = representations.reshape([-1, flat_dims])

    # Setup the buffers, which will hold the most recent observations,
    # actions and rewards within the current episode for each environment.
    if not self.batch_size:
        self.batch_size = representations.shape[0]
        self.create_buffers()

    logits = self.dense(representations.float())

    # The policy is the distribution over actions given the current state.
    policy = Categorical(logits=logits)
    sampled_actions = policy.sample()

    return PolicyHeadOutput(
        y_pred=sampled_actions,
        logits=logits,
        action_dist=policy,
    )
def __init__(
    self,
    num_agents: int,
    action_spaces: List[Space],
    observation_spaces: List[Space],
    gamma: float,
    **kwargs
):
    """Store the per-agent spaces and the discount factor.

    :param num_agents (int): number of agents
    :param action_spaces (List[Space]): one action space per agent
    :param observation_spaces (List[Space]): one observation space per agent
    :param gamma (float): discount factor

    :attr n_acts (List[int]): flat dimension of each agent's action space

    Extra keyword arguments are accepted and ignored.
    """
    self.num_agents = num_agents
    self.action_spaces = action_spaces
    self.observation_spaces = observation_spaces
    self.gamma: float = gamma

    self.n_acts = list(map(flatdim, action_spaces))
def __init__(self, learning_rate: float =0.5, epsilon: float =1.0, **kwargs):
    """Set up the per-agent tables used by joint action learning.

    :param learning_rate (float): learning rate for Q-learning updates
    :param epsilon (float): epsilon value for all agents

    :attr q_tables (List[DefaultDict]): one table per agent; missing
        entries default to 0
    :attr models (List[DefaultDict[DefaultDict]]): one nested table per
        agent; missing inner entries default to 0
    :attr c_obss (List[DefaultDict]): one counter table per agent; missing
        entries default to 0

    Remaining keyword arguments are forwarded to the parent constructor.
    """
    super().__init__(**kwargs)

    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.n_acts = [flatdim(space) for space in self.action_spaces]

    agent_ids = range(self.num_agents)

    # initialise Q-tables for all agents
    self.q_tables: List[DefaultDict] = [
        defaultdict(lambda: 0) for _ in agent_ids
    ]

    # initialise models for each agent mapping state to other agent
    # actions to count of other agent action in state
    self.models = [
        defaultdict(lambda: defaultdict(lambda: 0)) for _ in agent_ids
    ]

    # count observations - count for each agent
    self.c_obss = [defaultdict(lambda: 0) for _ in agent_ids]
def __init__(self, action_space: Space, obs_space: Space, gamma: float, epsilon: float, **kwargs):
    """Store the spaces and hyper-parameters and create an empty Q-table.

    :param action_space (Space): action space of the environment
    :param obs_space (Space): observation space of the environment
    :param gamma (float): discount factor (gamma)
    :param epsilon (float): epsilon for epsilon-greedy action selection

    :attr n_acts (int): flat dimension of the action space
    :attr q_table (DefaultDict): Q-value table; missing entries default to 0
    :attr epsilon_initial (float): the epsilon value passed at construction

    Extra keyword arguments are accepted and ignored.
    """
    self.action_space = action_space
    self.obs_space = obs_space
    self.n_acts = flatdim(action_space)

    self.gamma: float = gamma
    self.epsilon: float = epsilon
    # Remember the starting epsilon alongside the current value.
    self.epsilon_initial = epsilon

    self.q_table: DefaultDict = defaultdict(lambda: 0)
def test_flatten_space_boxes(space):
    """A flattened space is a 1-D `Box` whose length equals `utils.flatdim(space)`."""
    flat_space = utils.flatten_space(space)
    assert isinstance(
        flat_space, Box
    ), f"Expected {type(flat_space)} to equal {Box}"

    flatdim = utils.flatdim(space)
    # Unpacking also checks that the flattened space has exactly one axis.
    [single_dim] = flat_space.shape
    assert (
        single_dim == flatdim
    ), f"Expected {single_dim} to equal {flatdim}"
def test_flatten_space_boxes(space):
    """A flattened space is a 1-D `Box` whose length equals `utils.flatdim(space)`."""
    flattened = utils.flatten_space(space)
    assert isinstance(flattened, Box), "Expected {} to equal {}".format(
        type(flattened), Box)

    expected_dim = utils.flatdim(space)
    # Unpacking also checks that the flattened space has exactly one axis.
    [actual_dim] = flattened.shape
    assert actual_dim == expected_dim, "Expected {} to equal {}".format(
        actual_dim, expected_dim)
def box_encoder(input_space: spaces.Box, output_space: Space, budget: int = None, hidden_dims: List[int] = None, **kwargs) -> nn.Module:
    """Create an encoder mapping a Box input space to a Box output space.

    Image inputs are delegated to ``image_encoder`` (which receives the
    extra keyword arguments). Otherwise an MLP is returned. When
    ``hidden_dims`` is not given it is three layers of 64 units, or, if a
    parameter ``budget`` is given, three layers of
    ``round(sqrt(budget // 3))`` units.
    """
    n_in = flatdim(input_space)
    n_out = flatdim(output_space)
    assert isinstance(output_space, spaces.Box), "only support box output shape for now."

    if is_image_space(input_space):
        return image_encoder(input_space, output_space, budget=budget, **kwargs)

    if hidden_dims is None:
        if budget is None:
            hidden_dims = [64, 64, 64]
        else:
            # There are, in total, this many parameters, as a function of the
            # input size, hidden size, output size, and number of layers.
            # Would be cool if we could determine the hidden_dims given the
            # budget.
            # n_params = (
            #     input_dims * hidden_dims + hidden_dims +  # first dense layer
            #     (hidden_dims * hidden_dims + hidden_dims) * (n_layers - 2) +  # Hidden layers
            #     hidden_dims * output_dims + hidden_dims  # Output layer
            # )
            n_layers = 3
            width = round(math.sqrt(budget // n_layers))
            hidden_dims = [width] * n_layers

    return MLP(n_in, *hidden_dims, output_dims=n_out)
def discrete_encoder(input_space: spaces.Discrete, output_space: Space, budget: int = None, **kwargs) -> nn.Module:
    """Create an encoder for a Discrete input by tiling the value.

    Returns a ``Tile`` module sized to the flat dimension of
    ``output_space``. ``budget`` and extra keyword arguments are accepted
    but not used. The output space must be a float32 `Box` (checked with
    assertions).
    """
    # Just tile / copy the input value into a tensor.
    assert isinstance(output_space, spaces.Box), "Only support box output space for now."
    assert output_space.dtype == np.float32, output_space.dtype
    output_dims = flatdim(output_space)
    # An unreachable second `return LambdaModule(...)` used to follow this
    # statement; it could never execute and has been removed.
    return Tile(output_dims)
def __init__(
    self,
    agent_id,
    obs_space,
    action_space,
    lr,
    adam_eps,
    recurrent_policy,
    num_steps,
    num_processes,
    device,
):
    """Create the policy model, its rollout storage and its Adam optimizer.

    The model is moved to ``device`` before the optimizer is created. The
    model and optimizer are registered in ``self.saveables``.
    """
    self.agent_id = agent_id

    self.obs_space = obs_space
    self.action_space = action_space
    self.obs_size = flatdim(obs_space)
    self.action_size = flatdim(action_space)

    policy_kwargs = {"recurrent": recurrent_policy}
    self.model = Policy(obs_space, action_space, base_kwargs=policy_kwargs)

    self.storage = RolloutStorage(
        obs_space,
        action_space,
        self.model.recurrent_hidden_state_size,
        num_steps,
        num_processes,
    )

    self.model.to(device)
    self.optimizer = optim.Adam(self.model.parameters(), lr, eps=adam_eps)

    # self.intr_stats = RunningStats()

    self.saveables = dict(model=self.model, optimizer=self.optimizer)
def __init__(self, input_space: gym.Space, action_space: gym.Space, reward_space: gym.Space = None, hparams: "OutputHead.HParams" = None, name: str = ""):
    """Store the spaces, hyper-parameters and name of this output head.

    Falsy arguments fall back to defaults: an unbounded scalar `Box` for
    ``reward_space``, ``self.HParams()`` for ``hparams``, and the class-level
    ``name`` attribute for ``name``.
    """
    super().__init__()

    self.input_space = input_space
    self.action_space = action_space
    if reward_space:
        self.reward_space = reward_space
    else:
        self.reward_space = spaces.Box(-np.inf, np.inf, ())
    self.input_size = flatdim(input_space)

    self.hparams = hparams if hparams else self.HParams()
    if not isinstance(self.hparams, self.HParams):
        # Upgrade the hparams to the right type, if needed.
        self.hparams = self.upgrade_hparams()

    self.name = name if name else type(self).name
def create_shmem_space(multiproc_context, space):
    """Create a shared-memory container matching ``space``.

    Box / Discrete / MultiBinary / MultiDiscrete spaces get a flat
    ``multiproc_context.Array`` of ``flatdim(space)`` elements. Dict and
    Tuple spaces get an instance of a ctypes ``Structure`` with one field
    per sub-space.

    Raises:
        NotImplementedError: for any other space type.
    """
    # NOTE(review): the sub-space fields are built with
    # `create_shmem_observation`, which is not defined in this block;
    # confirm it is the intended helper.
    if isinstance(space, (Box, Discrete, MultiBinary, MultiDiscrete)):
        ctype = _NP_TO_CT[space.dtype]
        return multiproc_context.Array(ctype, flatdim(space))

    if isinstance(space, Dict):
        keyed_fields = [
            (key, create_shmem_observation(multiproc_context, subspace))
            for key, subspace in space.spaces.items()
        ]

        class TempStruct(ctypes.Structure):
            _fields_ = keyed_fields

        return TempStruct()

    if isinstance(space, Tuple):
        # NOTE(review): field names here are integers; ctypes documents
        # field names as strings, so verify this branch works.
        indexed_fields = [
            (i, create_shmem_observation(multiproc_context, subspace))
            for i, subspace in enumerate(space.spaces)
        ]

        class TempStruct(ctypes.Structure):
            _fields_ = indexed_fields

        return TempStruct()

    raise NotImplementedError
def test_flatten_dim(space):
    """A flattened sample is 1-D and its length equals `utils.flatdim(space)`."""
    flat_sample = utils.flatten(space, space.sample())
    # Unpacking also checks that the flattened sample has exactly one axis.
    [actual_dim] = flat_sample.shape
    expected_dim = utils.flatdim(space)
    assert actual_dim == expected_dim, "Expected {} to equal {}".format(
        actual_dim, expected_dim)
def test_flatten_dim(space):
    """A flattened sample is 1-D and its length equals `utils.flatdim(space)`."""
    sample = utils.flatten(space, space.sample())
    # Unpacking also checks that the flattened sample has exactly one axis.
    [single_dim] = sample.shape

    flatdim = utils.flatdim(space)
    assert (
        single_dim == flatdim
    ), f"Expected {single_dim} to equal {flatdim}"
def test_flatdim(space, flatdim):
    """`utils.flatdim(space)` equals the expected `flatdim` test parameter."""
    dim = utils.flatdim(space)
    assert (
        dim == flatdim
    ), f"Expected {dim} to equal {flatdim}"
def __init__(self, setting: SettingType, hparams: HParams, config: Config):
    """Create the encoder and the output head for the given setting.

    Copies the spaces and the Observations/Actions/Rewards types from
    ``setting``, builds an encoder (a dense `FCNet` when the setting is a
    `ContinualRLSetting` that observes the state directly, otherwise
    whatever ``hparams.make_encoder()`` returns), moves it to the configured
    device and creates the output head.
    """
    super().__init__()
    self.setting: SettingType = setting
    self.hp: BaseModel.HParams = hparams
    self.config: Config = config

    self.Observations: Type[Observations] = setting.Observations
    self.Actions: Type[Actions] = setting.Actions
    self.Rewards: Type[Rewards] = setting.Rewards

    self.observation_space: gym.Space = setting.observation_space
    self.action_space: gym.Space = setting.action_space
    self.reward_space: gym.Space = setting.reward_space

    # The first entry of the observation space is the encoder input.
    self.input_shape = self.observation_space[0].shape
    self.reward_shape = self.reward_space.shape

    self.split_batch_transform = SplitBatch(
        observation_type=self.Observations, reward_type=self.Rewards)

    # TODO: Decided to Not set this property, so the trainer doesn't
    # fallback to using it instead of the passed datamodules/dataloaders.
    # self.datamodule: LightningDataModule = setting

    # (Testing) Setting this attribute is supposed to help with ddp/etc
    # training in pytorch-lightning. Not 100% sure.
    # self.example_input_array = torch.rand(self.batch_size, *self.input_shape)

    # Create the encoder and the output head.
    # Space of our encoder representations.
    self.representation_space: gym.Space

    observes_state = (
        isinstance(setting, ContinualRLSetting)
        and setting.observe_state_directly
    )
    if observes_state:
        # ISSUE # 62: Need to add a dense network instead of no encoder, and
        # change the PolicyHead to have only one layer.
        # Only pass the image, not the task labels to the encoder (for now).
        n_in = flatdim(self.observation_space[0])
        n_out = self.hp.new_hidden_size or 128

        self.encoder = FCNet(
            in_features=n_in,
            out_features=n_out,
            hidden_layers=3,
            hidden_neurons=[256, 128, n_out],
            activation=nn.ReLU,
        )
        unbounded_box = spaces.Box(low=-np.inf, high=np.inf, shape=[n_out])
        self.representation_space = add_tensor_support(unbounded_box)
        self.hidden_size = n_out
    else:
        # TODO: Refactor this 'make_encoder' being on the hparams, its a bit
        # weird.
        self.encoder, self.hidden_size = self.hp.make_encoder()
        # TODO: Check that the outputs of the encoders are actually
        # flattened. I'm not sure they all are, which case the samples
        # wouldn't match with this space.
        self.representation_space = spaces.Box(
            -np.inf, np.inf, (self.hidden_size,), np.float32)

    logger.info(f"Moving encoder to device {self.config.device}")
    self.encoder = self.encoder.to(self.config.device)

    self.representation_space = add_tensor_support(self.representation_space)
    self.output_head: OutputHead = self.create_output_head(setting, task_id=None)
def update_anchor_weights(self, new_task_id: int) -> None:
    """Update the FIMs and other EWC params before starting training on a new task.

    Snapshots the current model weights, computes a Fisher information
    matrix from the collected observations, consolidates it for
    ``new_task_id`` and clears the observation collector.

    Parameters
    ----------
    new_task_id : int
        The ID of the new task.

    Raises
    ------
    NotImplementedError
        If the model's output head is neither a ClassificationHead nor a
        RegressionHead.
    """
    # we dont want to go here at test time.
    # NOTE: We also switch between unknown tasks.
    logger.info(
        f"Updating the EWC 'anchor' weights before starting training on "
        f"task {new_task_id}")
    current_weights = self.get_current_model_weights()
    self.previous_model_weights = current_weights.clone().detach()

    # Create a Dataloader from the stored observations.
    obs_type: Type[Observations] = type(self.observation_collector[0])
    stored_batches = [
        obs.as_namedtuple() for obs in self.observation_collector
    ]
    # Or, alternatively (see the note below on why we don't use this):
    # stacked_observations: Observations = obs_type.stack(self.observation_collector)
    # dataset = TensorDataset(*stacked_observations.as_namedtuple())

    # NOTE: This is equivalent to just using the same batch size as during
    # training, as each Observations in the list is already a batch.
    # NOTE: We keep the same batch size here as during training because for
    # instance in RL, it would be weird to suddenly give some new batch size,
    # since the buffers would get cleared and re-created just for these forward
    # passes
    dataloader = DataLoader(stored_batches, batch_size=None, collate_fn=None)
    # TODO: Would be nice to have a progress bar here.

    # Create the parameters to be passed to the FIM function. These may vary a
    # bit, depending on if we're being applied in a classification setting or in
    # a regression setting (not done yet)
    variant: str
    # TODO: Change this conditional to be based on the type of action space, rather
    # than of output head.
    output_head = self._model.output_head
    if isinstance(output_head, ClassificationHead):
        variant = "classif_logits"
        n_output = self._model.action_space.n

        def fim_function(*inputs) -> Tensor:
            observations = obs_type(*inputs).to(self._model.device)
            forward_pass: ForwardPass = self._model(observations)
            return forward_pass.actions.logits

    elif isinstance(output_head, RegressionHead):
        # NOTE: This hasn't been tested yet.
        variant = "regression"
        n_output = flatdim(self._model.action_space)

        def fim_function(*inputs) -> Tensor:
            observations = obs_type(*inputs).to(self._model.device)
            forward_pass: ForwardPass = self._model(observations)
            return forward_pass.actions.y_pred

    else:
        raise NotImplementedError("TODO")

    with self._ignoring_task_boundaries():
        # Prevent recursive calls to `on_task_switch` from affecting us (can be
        # called from MultiheadModel). (TODO: MultiheadModel will be fixed soon.)
        # layer_collection = LayerCollection.from_model(self.model.shared_modules())
        # nngeometry BUG: this doesn't work when passing the layer
        # collection instead of the model
        new_fim = FIM(
            model=self.model.shared_modules(),
            loader=dataloader,
            representation=self.options.fim_representation,
            n_output=n_output,
            variant=variant,
            function=fim_function,
            device=self._model.device,
            layer_collection=None,
        )

    # TODO: There was maybe an idea to use another fisher information matrix for
    # the critic in A2C, but not doing that atm.
    self.consolidate([new_fim], task=new_task_id)
    self.observation_collector.clear()
from tensorflow.python.keras.callbacks import TensorBoard
from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'card_game-v0'

# Get the environment and extract the number of actions.
tensor_board = TensorBoard('/tmp/tensor')
env = gym.make(ENV_NAME)
nb_actions = env.action_space.n

# Model: flatten the (1, obs_dims) input window, two ReLU hidden layers
# (24 and 12 units) and a linear output with one unit per action.
obs_dims = flatdim(env.observation_space)
model = Sequential()
for layer in (
    Flatten(input_shape=(1, obs_dims)),
    Dense(24),
    Activation('relu'),
    Dense(12),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
):
    model.add(layer)
print(model.summary())

# Agent
# gamma = 0.95
# epsilon = 1.0
# epsilon_min = 0.01
def on_task_switch(self, task_id: Optional[int]):
    """Executed when the task switches (to either a known or unknown task).

    Does nothing when disabled. Without shared weights only a message is
    logged. The first switch just records the task. Later switches during
    training (to an unknown task or a task with a larger id) snapshot the
    shared weights, compute a Fisher information matrix from the collected
    observations, consolidate it and clear the collector.
    """
    if not self.enabled:
        return

    logger.info(f"On task switch called: task_id={task_id}")

    if self._shared_net is None:
        logger.info(
            f"On task switch called: task_id={task_id}, EWC cannot be "
            f"applied as there are no shared weights.")

    elif self.previous_task is None and self.n_switches == 0 and not task_id:
        self.previous_task = task_id
        logger.info("Starting the first task, no EWC update.")
        self.n_switches += 1

    # NOTE(review): if `previous_task` is still None here and `task_id` is a
    # non-zero int, `task_id > self.previous_task` compares int with None.
    elif self._model.training and (task_id is None or task_id > self.previous_task):
        # we dont want to go here at test time.
        # NOTE: We also switch between unknown tasks.
        logger.info(f"Switching tasks: {self.previous_task} -> {task_id}: "
                    f"Updating the EWC 'anchor' weights.")
        self.previous_task = task_id

        device = self._model.config.device
        shared_weights = PVector.from_model(self._shared_net.to(device))
        self.previous_model_weights = shared_weights.clone().detach()

        # Create a Dataloader from the stored observations.
        obs_type: Type[Observations] = type(self.observation_collector[0])
        stored_batches = [
            obs.as_namedtuple() for obs in self.observation_collector
        ]
        # Or, alternatively (see the note below on why we don't use this):
        # stacked_observations: Observations = obs_type.stack(self.observation_collector)
        # dataset = TensorDataset(*stacked_observations.as_namedtuple())

        # NOTE: This is equivalent to just using the same batch size as during
        # training, as each Observations in the list is already a batch.
        # NOTE: We keep the same batch size here as during training because for
        # instance in RL, it would be weird to suddenly give some new batch size,
        # since the buffers would get cleared and re-created just for these forward
        # passes
        dataloader = DataLoader(stored_batches, batch_size=None, collate_fn=None)

        # Create the parameters to be passed to the FIM function. These may vary a
        # bit, depending on if we're being applied in a classification setting or in
        # a regression setting (not done yet)
        variant: str
        output_head = self._model.output_head
        if isinstance(output_head, ClassificationHead):
            variant = "classif_logits"
            n_output = self._model.action_space.n

            def fim_function(*inputs) -> Tensor:
                observations = obs_type(*inputs).to(self._model.device)
                forward_pass: ForwardPass = self._model(observations)
                return forward_pass.actions.logits

        elif isinstance(output_head, RegressionHead):
            # NOTE: This hasn't been tested yet.
            variant = "regression"
            n_output = flatdim(self._model.action_space)

            def fim_function(*inputs) -> Tensor:
                observations = obs_type(*inputs).to(self._model.device)
                forward_pass: ForwardPass = self._model(observations)
                return forward_pass.actions.y_pred

        else:
            raise NotImplementedError("TODO")

        new_fim = FIM(
            model=self._shared_net,
            loader=dataloader,
            representation=self.options.fim_representation,
            n_output=n_output,
            variant=variant,
            function=fim_function,
            device=self._model.device,
        )
        # TODO: There was maybe an idea to use another fisher information matrix for
        # the critic in A2C, but not doing that atm.
        # `previous_task` was already updated above, so this consolidates
        # under the id of the task being switched to.
        self.consolidate([new_fim], task=self.previous_task)
        self.n_switches += 1
        self.observation_collector.clear()
def dict_encoder(input_space: spaces.Dict, output_space: Space, budget: int = None, shared_budget: int = None, hidden_dims: int = 512, split: Dict[str, Any] = None, shared: Dict[str, Any] = None, **kwargs) -> nn.Module:
    """ IDEA: Create an encoder for each item in the dict, mapping from the
    corresponding input space to some kind of latent space, and then add a
    flatten/concat layer, then map to the provided output space.

    budget: Total parameter budget, or None for no budget.
    shared_budget: The budget for the shared portion of the network. Must
        not exceed `budget`. If only `budget` is given, the shared budget
        is set to 1/2 of the total budget. The remainder of `budget` is
        divided between the per-key encoders in proportion to the flat
        size of each input.
    hidden_dims: Total size of the concatenated latent space; each key gets
        a share proportional to its flat input size (at least 1).
    split: Per-key keyword arguments for the per-key encoders.
    shared: Keyword arguments for the shared encoder.

    Raises:
        NotImplementedError: if `output_space` is not a Box.
        ValueError: if `shared_budget` is larger than `budget`.
    """
    split = split or {}
    shared = shared or {}
    if kwargs:
        warnings.warn(
            RuntimeWarning(
                f"Ignoring kwargs {kwargs}! (This acceps 'split' and 'shared' to "
                f"hold the hparams of the split and shared portions of the "
                f"encoder)."))

    total_input_dims = flatdim(input_space)
    total_output_dims = flatdim(output_space)
    n_inputs = len(input_space.spaces)

    if not isinstance(output_space, spaces.Box):
        raise NotImplementedError("Only support Box output spaces for now.")

    # BUGFIX: `shared_budget` used to be re-declared as None at this point,
    # which silently discarded the value passed by the caller.
    split_budget: Optional[int] = None
    if budget is not None:
        if shared_budget is None:
            shared_budget = budget // 2
        elif shared_budget > budget:
            raise ValueError(
                f"shared_budget ({shared_budget}) can't be larger than the "
                f"total budget ({budget}).")
        split_budget = budget - shared_budget

    encoders: Dict[str, nn.Module] = {}
    latent_spaces: Dict[str, Space] = {}
    for key, subspace in input_space.spaces.items():
        dimension_input_dim = flatdim(subspace)

        # TODO: Each output will be a Box for now, and each dimension will have
        # a number of 'dedicated' features in the 'output space' that will be
        # proportional to their size in the input space.
        dimension_output_dim = round(dimension_input_dim / total_input_dims * hidden_dims)
        dimension_output_dim = max(dimension_output_dim, 1)
        dimension_output_dim = min(dimension_output_dim, total_output_dims - (n_inputs - 1))
        assert 0 < dimension_output_dim < total_output_dims

        dimension_latent_space: Space = spaces.Box(
            0, 1, shape=[dimension_output_dim])
        latent_spaces[key] = dimension_latent_space

        # The 'budget', in number of parameters, that gets allotted for the
        # encoding of this dimension.
        dimension_budget = None
        if split_budget is not None:
            # The dimension gets a portion of the budget based on the proportion
            # of its input space compared to the total.
            dimension_budget = round(dimension_input_dim / total_input_dims * split_budget)

        encoders[key] = create_encoder(subspace,
                                       output_space=dimension_latent_space,
                                       budget=dimension_budget,
                                       **split.get(key, {}))

    # Encoder that processes each input separately and produces a "latent space"
    # for each input dimension. (dict input, dict output).
    split_encoders_module = DictEncoder(encoders)

    actual_split_params = n_parameters(split_encoders_module)
    if split_budget is not None and actual_split_params > split_budget:
        warnings.warn(
            RuntimeWarning(
                f"The budget for the 'split' portion of the encoder was "
                f"{split_budget} parameters, but somehow the constructed "
                f"module has {actual_split_params} parameters!"))

    # Operation that 'concatenates' all the hidden spaces together.
    concat_layer = LambdaModule(
        f=lambda d: torch.cat(list(d.values()), dim=-1))

    latent_dims = sum(map(flatdim, latent_spaces.values()))
    fused_latent_space = spaces.Box(
        -np.inf,
        np.inf,
        shape=[latent_dims],
    )
    assert latent_dims == hidden_dims, "The sum of latent spaces didn't equal the prescribed hidden dims?"

    # NOTE(review): the shared encoder receives the *total* `budget`, not
    # `shared_budget`. That looks unintended given the docstring, but it is
    # left unchanged so existing model sizes stay the same. Confirm.
    shared_module = create_encoder(
        fused_latent_space,
        output_space=output_space,
        budget=budget,
        **shared,
    )
    return nn.Sequential(
        split_encoders_module,
        concat_layer,
        shared_module,
    )
def get_number_inputs(self):
    """Return the flat dimension of the environment's observation space."""
    observation_space = self.env.observation_space
    return flatdim(observation_space)
def get_number_outputs(self):
    """Return the flat dimension of the environment's action space."""
    action_space = self.env.action_space
    return flatdim(action_space)
def test_with_controllable_episode_lengths(batch_size: int, monkeypatch):
    """ TODO: Test out the PolicyHead in a very controlled environment, where we
    know exactly the lengths of each episode.

    The first episode of env 0 lasts 5 steps; every other episode (in any
    env) lasts 10 steps. The policy-gradient loss is replaced by a mock
    that returns the episode length, so the expected loss at each step is
    known exactly.
    """
    env = FakeEnvironment(
        partial(gym.make, "CartPole-v0"),
        batch_size=batch_size,
        episode_lengths=[5, *(10 for _ in range(batch_size - 1))],
        new_episode_length=lambda env_index: 10,
    )
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    obs_space = env.single_observation_space
    x_dim = flatdim(obs_space["x"])
    # Create some dummy encoder.
    encoder = nn.Linear(x_dim, x_dim)
    representation_space = obs_space["x"]

    output_head = PolicyHead(
        input_space=representation_space,
        action_space=env.single_action_space,
        reward_space=env.single_reward_space,
        hparams=PolicyHead.HParams(
            max_episode_window_length=100,
            min_episodes_before_update=1,
            accumulate_losses_before_backward=False,
        ),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(
        output_head.parameters(), lr=1e-3
    )

    # Simplify the loss function so we know exactly what the loss should be at
    # each step.
    def mock_policy_gradient(
        rewards: Sequence[float], log_probs: Sequence[float], gamma: float = 0.95
    ) -> Optional[Loss]:
        log_probs = (log_probs - log_probs.clone()) + 1
        # Return the length of the episode, but with a "gradient" flowing back into log_probs.
        return len(rewards) * log_probs.mean()

    monkeypatch.setattr(output_head, "policy_gradient", mock_policy_gradient)

    batch_size = env.batch_size

    obs = env.reset()
    # BUGFIX: `np.bool` was a deprecated alias of the builtin `bool` and has
    # been removed from recent NumPy releases; use `bool` directly.
    step_done = np.zeros(batch_size, dtype=bool)

    for step in range(200):
        x, obs_done = obs["x"], obs["done"]

        # The done from the obs should always be the same as the 'done' from the 'step' function.
        assert np.array_equal(obs_done, step_done)

        representations = encoder(x)
        observations = ContinualRLSetting.Observations(x=x, done=obs_done,)

        actions_obj = output_head(observations, representations)
        actions = actions_obj.y_pred

        # TODO: kinda useless to wrap a single tensor in an object..
        forward_pass = ForwardPass(
            observations=observations,
            representations=representations,
            actions=actions,
        )
        obs, rewards, step_done, info = env.step(actions)

        rewards_obj = ContinualRLSetting.Rewards(y=rewards)
        loss = output_head.get_loss(
            forward_pass=forward_pass, actions=actions_obj, rewards=rewards_obj,
        )
        print(f"Step {step}")
        print(f"num episodes since update: {output_head.num_episodes_since_update}")
        print(f"steps left in episode: {env.steps_left_in_episode}")
        print(f"Loss for that step: {loss}")

        if any(obs_done):
            assert loss != 0.0

        if step == 5.0:
            # Env 0 first episode from steps 0 -> 5
            assert loss.loss == 5.0
            assert loss.metrics["gradient_usage"].used_gradients == 5.0
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 10:
            # Envs[1:batch_size], first episode, from steps 0 -> 10
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 10.0 * (
                batch_size - 1
            )
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 15:
            # Env 0 second episode from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6
        elif step == 20:
            # Envs[1:batch_size]: second episode, from steps 10 -> 20
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )
        elif step == 25:
            # Env 0 third episode from steps 15 -> 25
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6
        elif step > 0 and step % 10 == 0:
            # Same pattern as step 20 above
            assert loss.loss == 10.0 * (batch_size - 1), step
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )
        elif step > 0 and step % 5 == 0:
            # Same pattern as step 25 above
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6
        else:
            assert loss.loss == 0.0, step
def test_flatdim(space, flatdim):
    """`utils.flatdim(space)` equals the expected `flatdim` test parameter."""
    computed_dim = utils.flatdim(space)
    assert computed_dim == flatdim, "Expected {} to equal {}".format(
        computed_dim, flatdim)