Example #1
def test_flatdim(base_space: gym.Space):
    sparse_space = Sparse(base_space, sparsity=0.)

    base_flat_dims = flatdim(base_space)
    sparse_flat_dims = flatdim(sparse_space)

    assert base_flat_dims == sparse_flat_dims
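
Example #1 only asserts that wrapping a space in Sparse leaves its flattened dimensionality untouched. For reference, flatdim returns the length of the vector that flattening a sample of the space would produce. A minimal sketch (not taken from any of the snippets in this listing; it assumes a gym version where the helper lives in gym.spaces.utils):

    from gym import spaces
    from gym.spaces.utils import flatdim

    assert flatdim(spaces.Box(low=0.0, high=1.0, shape=(3, 4))) == 12  # product of the shape
    assert flatdim(spaces.Discrete(5)) == 5                            # size of the one-hot encoding
    # Composite spaces just sum the flat dimensions of their sub-spaces.
    assert flatdim(spaces.Tuple([spaces.Discrete(5),
                                 spaces.Box(low=0.0, high=1.0, shape=(2,))])) == 7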
Example #2
    def __init__(self,
                 env,
                 expert_trajs=None,
                 discrim_arch=relu_net,
                 discrim_arch_args={},
                 name='gail'):
        super(GAIL, self).__init__()
        self.dO = flatdim(env.observation_space)
        self.dU = flatdim(env.action_space)
        self.set_demos(expert_trajs)

        # build energy model
        with tf.variable_scope(name) as vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO],
                                        name='obs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='act')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            obs_act = tf.concat([self.obs_t, self.act_t], axis=1)
            logits = discrim_arch(obs_act, **discrim_arch_args)
            self.predictions = tf.nn.sigmoid(logits)
            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                        labels=self.labels))
            self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(
                self.loss)
            self._make_param_ops(vs)
Example #3
def unflatten(space, x):
    if isinstance(space, Box):
        return np.asarray(x, dtype=space.dtype).reshape(space.shape)
    elif isinstance(space, Discrete):
        return int(np.nonzero(x)[0][0])
    elif isinstance(space, MultiBinary):
        return np.asarray(x, dtype=space.dtype).reshape(space.shape)
    elif isinstance(space, MultiDiscrete):
        return np.asarray(x, dtype=space.dtype).reshape(space.shape)
    elif isinstance(space, Tuple):
        dims = [flatdim(s) for s in space.spaces]
        list_flattened = np.split(x, np.cumsum(dims)[:-1])
        list_unflattened = [
            unflatten(s, flattened)
            for flattened, s in zip(list_flattened, space.spaces)
        ]
        return tuple(list_unflattened)
    elif isinstance(space, Dict):
        dims = [flatdim(s) for s in space.spaces.values()]
        list_flattened = np.split(x, np.cumsum(dims)[:-1])
        list_unflattened = [
            (key, unflatten(s, flattened))
            for flattened, (key,
                            s) in zip(list_flattened, space.spaces.items())
        ]
        return dict(list_unflattened)
    else:
        raise NotImplementedError
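
The unflatten above is the inverse of the corresponding flatten operation, and flatdim is what tells the Tuple and Dict branches where to split the flat vector. A rough round-trip check (a sketch only; it assumes the matching flatten helper from gym.spaces.utils):

    import numpy as np
    from gym import spaces
    from gym.spaces.utils import flatdim, flatten

    space = spaces.Tuple((spaces.Discrete(3),
                          spaces.Box(low=0.0, high=1.0, shape=(2,), dtype=np.float32)))
    sample = space.sample()
    flat = flatten(space, sample)            # 1-D array of length flatdim(space) == 3 + 2
    assert flat.shape == (flatdim(space),)
    restored = unflatten(space, flat)        # the function defined above
    assert restored[0] == sample[0]
    assert np.allclose(restored[1], sample[1])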
Example #4
    def __init__(self, observation_space, action_space):
        self.observation_space = observation_space
        self.action_space = action_space

        self.item_dim = flatdim(self.observation_space['equipped_items']) + \
            flatdim(self.observation_space['inventory'])

        self.actor = Actor(self.action_space.nvec, self.item_dim).to(device)
        self.critic = Critic(self.item_dim).to(device)
        self.discriminator = Discriminator(self.action_space.nvec,
                                           self.item_dim).to(device)

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=LEARNING_RATE)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=LEARNING_RATE)
        self.discriminator_optim = torch.optim.Adam(
            self.discriminator.parameters(), lr=LEARNING_RATE)

        self.mse_loss = nn.MSELoss(reduction='none')
        self.bce_loss = nn.BCELoss()

        self.last_lp = None

        self.memory = []
Example #5
def test_flatdim(base_space: gym.Space):
    sparse_space = Sparse(base_space, sparsity=0.)

    base_flat_dims = flatdim(base_space)
    sparse_flat_dims = flatdim(sparse_space)
    assert base_flat_dims == sparse_flat_dims

    # The flattened dimensions shouldn't depend on the sparsity.
    sparse_space = Sparse(base_space, sparsity=1.)
    sparse_flat_dims = flatdim(sparse_space)
    assert base_flat_dims == sparse_flat_dims
Example #6
    def __init__(self,
                 env,
                 expert_trajs=None,
                 discrim_arch=relu_net,
                 discrim_arch_args={},
                 l2_reg=0,
                 discount=1.0,
                 max_itrs=100,
                 name='gcl'):
        super(AIRLStateAction, self).__init__()
        self.dO = flatdim(env.observation_space)
        self.dU = flatdim(env.action_space)
        self.set_demos(expert_trajs)
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO],
                                        name='obs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU],
                                        name='act')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1],
                                         name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            obs_act = tf.concat([self.obs_t, self.act_t], axis=1)
            with tf.variable_scope('discrim') as dvs:
                with tf.variable_scope('energy'):
                    self.energy = discrim_arch(obs_act, **discrim_arch_args)
                # we do not learn a separate log Z(s) because it is impossible to separate from the energy
                # In a discrete domain we can explicitly normalize to calculate log Z(s)
                log_p_tau = -self.energy
                discrim_vars = tf.get_collection('reg_vars', scope=dvs.name)

            log_q_tau = self.lprobs

            if l2_reg > 0:
                reg_loss = l2_reg * tf.reduce_sum(
                    [tf.reduce_sum(tf.square(var)) for var in discrim_vars])
            else:
                reg_loss = 0

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.d_tau = tf.exp(log_p_tau - log_pq)
            cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) *
                                        (log_q_tau - log_pq))

            self.loss = cent_loss + reg_loss
            self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(
                self.loss)
            self._make_param_ops(_vs)
Example #7
def multidiscrete_encoder(input_space: spaces.MultiDiscrete,
                          output_space: Space,
                          budget: int = None) -> nn.Module:
    # if input_space.shape[0]
    input_dims = flatdim(input_space)
    output_dims = flatdim(output_space)
    assert isinstance(output_space,
                      spaces.Box), "Only support Box output spaces for now."
    return nn.Sequential(
        LambdaModule(lambda v: v.reshape([-1, input_dims]).to(torch.float)),
        nn.Linear(input_dims, output_dims),
    )
Example #8
    def forward(self, observations: ContinualRLSetting.Observations,
                representations: Tensor) -> PolicyHeadOutput:
        """ Forward pass of a Policy head.

        TODO: Do we actually need the observations here? It is here so we have
        access to the 'done' from the env, but do we really need it here? or
        would there be another (cleaner) way to do this?
        """
        if len(representations.shape) < 2:
            # Flatten the representations.
            representations = representations.reshape(
                [-1, flatdim(self.input_space)])

        # Setup the buffers, which will hold the most recent observations,
        # actions and rewards within the current episode for each environment.
        if not self.batch_size:
            self.batch_size = representations.shape[0]
            self.create_buffers()

        representations = representations.float()

        logits = self.dense(representations)

        # The policy is the distribution over actions given the current state.
        action_dist = Categorical(logits=logits)
        sample = action_dist.sample()
        actions = PolicyHeadOutput(
            y_pred=sample,
            logits=logits,
            action_dist=action_dist,
        )
        return actions
Example #9
    def __init__(
        self,
        num_agents: int,
        action_spaces: List[Space],
        observation_spaces: List[Space],
        gamma: float,
        **kwargs
    ):
        """Constructor of base agent for Q-Learning

        Initializes basic variables of MARL agents
        namely epsilon, learning rate and discount rate.

        :param num_agents (int): number of agents
        :param action_spaces (List[Space]): action spaces of the environment for each agent
        :param observation_spaces (List[Space]): observation spaces of the environment for each agent
        :param gamma (float): discount factor (gamma)

        :attr n_acts (List[int]): number of actions for each agent
        """

        self.num_agents = num_agents
        self.action_spaces = action_spaces
        self.observation_spaces = observation_spaces
        self.n_acts = [flatdim(action_space) for action_space in action_spaces]

        self.gamma: float = gamma
Example #10
    def __init__(self, learning_rate: float = 0.5, epsilon: float = 1.0, **kwargs):
        """Constructor of JointActionLearning

        :param learning_rate (float): learning rate for Q-learning updates
        :param epsilon (float): epsilon value for all agents

        :attr q_tables (List[DefaultDict]): tables for Q-values mapping (OBS, ACT) pairs of
            observations and joint actions to respective Q-values for all agents
        :attr models (List[DefaultDict[DefaultDict]]): each agent holding model of other agent
            mapping observation to other agent actions to count of other agent action

        Initializes some variables of the Joint Action Learning agents, namely the epsilon, discount
        rate and learning rate
        """

        super().__init__(**kwargs)
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.n_acts = [flatdim(action_space) for action_space in self.action_spaces]

        # initialise Q-tables for all agents
        self.q_tables: List[DefaultDict] = [defaultdict(lambda: 0) for _ in range(self.num_agents)]

        # initialise models for each agent mapping state to other agent actions to count of other agent action
        # in state
        self.models = [defaultdict(lambda: defaultdict(lambda: 0)) for _ in range(self.num_agents)] 

        # count observations - count for each agent
        self.c_obss = [defaultdict(lambda: 0) for _ in range(self.num_agents)]
Example #11
    def __init__(self, action_space: Space, obs_space: Space, gamma: float,
                 epsilon: float, **kwargs):
        """Constructor of base agent for Q-Learning

        Initializes basic variables of the Q-Learning agent
        namely the epsilon, learning rate and discount rate.

        :param action_space (int): action space of the environment
        :param obs_space (int): observation space of the environment
        :param gamma (float): discount factor (gamma)
        :param epsilon (float): epsilon for epsilon-greedy action selection

        :attr n_acts (int): number of actions
        :attr q_table (DefaultDict): table for Q-values mapping (OBS, ACT) pairs of observations
            and actions to respective Q-values
        """

        self.action_space = action_space
        self.obs_space = obs_space
        self.n_acts = flatdim(action_space)

        self.epsilon: float = epsilon
        self.gamma: float = gamma

        self.q_table: DefaultDict = defaultdict(lambda: 0)

        self.epsilon_initial = self.epsilon  ##### CHECK THIS
Example #12
def test_flatten_space_boxes(space):
    flat_space = utils.flatten_space(space)
    assert isinstance(flat_space,
                      Box), f"Expected {type(flat_space)} to equal {Box}"
    flatdim = utils.flatdim(space)
    (single_dim, ) = flat_space.shape
    assert single_dim == flatdim, f"Expected {single_dim} to equal {flatdim}"
Example #13
def test_flatten_space_boxes(space):
    flat_space = utils.flatten_space(space)
    assert isinstance(flat_space, Box), "Expected {} to equal {}".format(
        type(flat_space), Box)
    flatdim = utils.flatdim(space)
    (single_dim, ) = flat_space.shape
    assert single_dim == flatdim, "Expected {} to equal {}".format(
        single_dim, flatdim)
Example #14
def box_encoder(input_space: spaces.Box,
                output_space: Space,
                budget: int = None,
                hidden_dims: List[int] = None,
                **kwargs) -> nn.Module:
    input_dims = flatdim(input_space)
    output_dims = flatdim(output_space)

    assert isinstance(output_space,
                      spaces.Box), "only support box output shape for now."

    if is_image_space(input_space):
        return image_encoder(input_space,
                             output_space,
                             budget=budget,
                             **kwargs)

    if hidden_dims is None:
        if budget is not None:
            # There are, in total, this many parameters, as a function of the input
            # size, hidden size, output size, and number of layers.
            # Would be cool if we could determine the hidden_dims given the budget.
            # n_params = (
            #     input_dims * hidden_dims + hidden_dims + # first dense layer
            #     (hidden_dims * hidden_dims + hidden_dims) * (n_layers - 2) + # Hidden layers
            #     hidden_dims * output_dims + hidden_dims # Output layer
            # )
            n_layers = 3
            hidden_dim = round(math.sqrt(budget // n_layers))
            hidden_dims = [hidden_dim for _ in range(n_layers)]
        else:
            hidden_dims = [
                64,
                64,
                64,
            ]

    return MLP(
        input_dims,
        *hidden_dims,
        output_dims=output_dims,
    )
Example #15
def discrete_encoder(input_space: spaces.Discrete,
                     output_space: Space,
                     budget: int = None,
                     **kwargs) -> nn.Module:
    # Just tile / copy the input value into a tensor.
    assert isinstance(output_space,
                      spaces.Box), "Only support box output space for now."
    assert output_space.dtype == np.float32, output_space.dtype
    output_dims = flatdim(output_space)
    return Tile(output_dims)
    # Unreachable alternative kept from the original snippet: fill a tensor of the
    # output shape with the (scalar) discrete value instead of tiling it.
    # return LambdaModule(
    #     lambda v: torch.empty(output_space.shape, dtype=torch.float).fill_(v))
Example #16
    def __init__(
        self,
        agent_id,
        obs_space,
        action_space,
        lr,
        adam_eps,
        recurrent_policy,
        num_steps,
        num_processes,
        device,
    ):
        self.agent_id = agent_id
        self.obs_size = flatdim(obs_space)
        self.action_size = flatdim(action_space)
        self.obs_space = obs_space
        self.action_space = action_space

        self.model = Policy(
            obs_space,
            action_space,
            base_kwargs={"recurrent": recurrent_policy},
        )

        self.storage = RolloutStorage(
            obs_space,
            action_space,
            self.model.recurrent_hidden_state_size,
            num_steps,
            num_processes,
        )

        self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr, eps=adam_eps)

        # self.intr_stats = RunningStats()
        self.saveables = {
            "model": self.model,
            "optimizer": self.optimizer,
        }
Example #17
    def __init__(self,
                 input_space: gym.Space,
                 action_space: gym.Space,
                 reward_space: gym.Space = None,
                 hparams: "OutputHead.HParams" = None,
                 name: str = ""):
        super().__init__()

        self.input_space = input_space
        self.action_space = action_space
        self.reward_space = reward_space or spaces.Box(-np.inf, np.inf, ())
        self.input_size = flatdim(input_space)
        self.hparams = hparams or self.HParams()
        if not isinstance(self.hparams, self.HParams):
            # Upgrade the hparams to the right type, if needed.
            self.hparams = self.upgrade_hparams()
        self.name = name or type(self).name
Example #18
def create_shmem_space(multiproc_context, space):
    if np.any([
            isinstance(space, t)
            for t in [Box, Discrete, MultiBinary, MultiDiscrete]
    ]):
        return multiproc_context.Array(_NP_TO_CT[space.dtype], flatdim(space))
    elif isinstance(space, Dict):

        class TempStruct(ctypes.Structure):
            _fields_ = [(key,
                         create_shmem_observation(multiproc_context, subspace))
                        for key, subspace in space.spaces.items()]

        return TempStruct()
    elif isinstance(space, Tuple):

        class TempStruct(ctypes.Structure):
            _fields_ = [(i,
                         create_shmem_observation(multiproc_context, subspace))
                        for i, subspace in enumerate(space.spaces)]

        return TempStruct()
    else:
        raise NotImplementedError
Example #19
def test_flatten_dim(space):
    sample = utils.flatten(space, space.sample())
    (single_dim, ) = sample.shape
    flatdim = utils.flatdim(space)
    assert single_dim == flatdim, "Expected {} to equal {}".format(
        single_dim, flatdim)
Example #20
def test_flatten_dim(space):
    sample = utils.flatten(space, space.sample())
    (single_dim,) = sample.shape
    flatdim = utils.flatdim(space)
    assert single_dim == flatdim, f"Expected {single_dim} to equal {flatdim}"
Example #21
def test_flatdim(space, flatdim):
    dim = utils.flatdim(space)
    assert dim == flatdim, f"Expected {dim} to equal {flatdim}"
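
Tests like this one (and the test_flatten_dim / test_flatten_space_boxes variants above) are normally driven by pytest parametrization over pairs of spaces and their expected flat dimensions. A hypothetical parametrization, just to make the fixture explicit (the space/dimension pairs below are illustrative and not taken from the original test suite):

    import pytest
    from gym import spaces
    from gym.spaces import utils

    @pytest.mark.parametrize("space,flatdim", [
        (spaces.Discrete(3), 3),
        (spaces.Box(low=0.0, high=1.0, shape=(3, 2)), 6),
        (spaces.MultiBinary(8), 8),
        (spaces.Tuple([spaces.Discrete(5), spaces.Discrete(2)]), 7),
    ])
    def test_flatdim(space, flatdim):
        dim = utils.flatdim(space)
        assert dim == flatdim, f"Expected {dim} to equal {flatdim}"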
Example #22
    def __init__(self, setting: SettingType, hparams: HParams, config: Config):
        super().__init__()
        self.setting: SettingType = setting
        self.hp: BaseModel.HParams = hparams

        self.Observations: Type[Observations] = setting.Observations
        self.Actions: Type[Actions] = setting.Actions
        self.Rewards: Type[Rewards] = setting.Rewards

        self.observation_space: gym.Space = setting.observation_space
        self.action_space: gym.Space = setting.action_space
        self.reward_space: gym.Space = setting.reward_space

        self.input_shape  = self.observation_space[0].shape
        self.reward_shape = self.reward_space.shape

        self.split_batch_transform = SplitBatch(observation_type=self.Observations,
                                                reward_type=self.Rewards)
        self.config: Config = config
        # TODO: Decided to Not set this property, so the trainer doesn't
        # fallback to using it instead of the passed datamodules/dataloaders.
        # self.datamodule: LightningDataModule = setting

        # (Testing) Setting this attribute is supposed to help with ddp/etc
        # training in pytorch-lightning. Not 100% sure.
        # self.example_input_array = torch.rand(self.batch_size, *self.input_shape)

        # Create the encoder and the output head.
        # Space of our encoder representations.
        self.representation_space: gym.Space
        if isinstance(setting, ContinualRLSetting) and setting.observe_state_directly:
            # ISSUE # 62: Need to add a dense network instead of no encoder, and
            # change the PolicyHead to have only one layer.
            # Only pass the image, not the task labels to the encoder (for now).
            input_dims = flatdim(self.observation_space[0])
            output_dims = self.hp.new_hidden_size or 128
            
            self.encoder = FCNet(
                in_features=input_dims,
                out_features=output_dims,
                hidden_layers=3,
                hidden_neurons=[256, 128, output_dims],
                activation=nn.ReLU,
            )
            self.representation_space = add_tensor_support(
                spaces.Box(low=-np.inf, high=np.inf, shape=[output_dims])
            )
            self.hidden_size = output_dims
        else:
            # TODO: Refactor this 'make_encoder' being on the hparams, its a bit
            # weird.
            self.encoder, self.hidden_size = self.hp.make_encoder()
            # TODO: Check that the outputs of the encoders are actually
            # flattened. I'm not sure they all are, which case the samples
            # wouldn't match with this space. 
            self.representation_space = spaces.Box(-np.inf, np.inf, (self.hidden_size,), np.float32)
        
        logger.info(f"Moving encoder to device {self.config.device}")
        self.encoder = self.encoder.to(self.config.device)
        
        self.representation_space = add_tensor_support(self.representation_space)
        self.output_head: OutputHead = self.create_output_head(setting, task_id=None)
Example #23
    def update_anchor_weights(self, new_task_id: int) -> None:
        """Update the FIMs and other EWC params before starting training on a new task.

        Parameters
        ----------
        new_task_id : int
            The ID of the new task.
        """
        # we don't want to go here at test time.
        # NOTE: We also switch between unknown tasks.
        logger.info(
            f"Updating the EWC 'anchor' weights before starting training on "
            f"task {new_task_id}")
        self.previous_model_weights = self.get_current_model_weights().clone(
        ).detach()

        # Create a Dataloader from the stored observations.
        obs_type: Type[Observations] = type(self.observation_collector[0])
        dataset = [obs.as_namedtuple() for obs in self.observation_collector]
        # Or, alternatively (see the note below on why we don't use this):
        # stacked_observations: Observations = obs_type.stack(self.observation_collector)
        # dataset = TensorDataset(*stacked_observations.as_namedtuple())

        # NOTE: This is equivalent to just using the same batch size as during
        # training, as each Observations in the list is already a batch.
        # NOTE: We keep the same batch size here as during training because for
        # instance in RL, it would be weird to suddenly give some new batch size,
        # since the buffers would get cleared and re-created just for these forward
        # passes
        dataloader = DataLoader(dataset, batch_size=None, collate_fn=None)
        # TODO: Would be nice to have a progress bar here.

        # Create the parameters to be passed to the FIM function. These may vary a
        # bit, depending on if we're being applied in a classification setting or in
        # a regression setting (not done yet)
        variant: str
        # TODO: Change this conditional to be based on the type of action space, rather
        # than of output head.
        if isinstance(self._model.output_head, ClassificationHead):
            variant = "classif_logits"
            n_output = self._model.action_space.n

            def fim_function(*inputs) -> Tensor:
                observations = obs_type(*inputs).to(self._model.device)
                forward_pass: ForwardPass = self._model(observations)
                actions = forward_pass.actions
                return actions.logits

        elif isinstance(self._model.output_head, RegressionHead):
            # NOTE: This hasn't been tested yet.
            variant = "regression"
            n_output = flatdim(self._model.action_space)

            def fim_function(*inputs) -> Tensor:
                observations = obs_type(*inputs).to(self._model.device)
                forward_pass: ForwardPass = self._model(observations)
                actions = forward_pass.actions
                return actions.y_pred

        else:
            raise NotImplementedError("TODO")

        with self._ignoring_task_boundaries():
            # Prevent recursive calls to `on_task_switch` from affecting us (can be
            # called from MultiheadModel). (TODO: MultiheadModel will be fixed soon.)
            # layer_collection = LayerCollection.from_model(self.model.shared_modules())
            # nngeometry BUG: this doesn't work when passing the layer
            # collection instead of the model
            new_fim = FIM(
                model=self.model.shared_modules(),
                loader=dataloader,
                representation=self.options.fim_representation,
                n_output=n_output,
                variant=variant,
                function=fim_function,
                device=self._model.device,
                layer_collection=None,
            )

        # TODO: There was maybe an idea to use another fisher information matrix for
        # the critic in A2C, but not doing that atm.
        new_fims = [new_fim]
        self.consolidate(new_fims, task=new_task_id)
        self.observation_collector.clear()
Example #24
import gym
from gym.spaces.utils import flatdim

from tensorflow.python.keras.callbacks import TensorBoard
from tensorflow.python.keras.layers import Activation, Dense, Flatten
from tensorflow.python.keras.models import Sequential

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'card_game-v0'

# Get the environment and extract the number of actions.
tensor_board = TensorBoard('/tmp/tensor')
env = gym.make(ENV_NAME)
nb_actions = env.action_space.n

# Model
obs_dims = flatdim(env.observation_space)

model = Sequential()
model.add(Flatten(input_shape=(1, obs_dims)))
model.add(Dense(24))
model.add(Activation('relu'))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Agent
# gamma = 0.95
# epsilon = 1.0
# epsilon_min = 0.01
Example #25
    def on_task_switch(self, task_id: Optional[int]):
        """ Executed when the task switches (to either a known or unknown task).
        """
        if not self.enabled:
            return

        logger.info(f"On task switch called: task_id={task_id}")

        if self._shared_net is None:
            logger.info(
                f"On task switch called: task_id={task_id}, EWC cannot be "
                f"applied as there are no shared weights.")

        elif self.previous_task is None and self.n_switches == 0 and not task_id:
            self.previous_task = task_id
            logger.info("Starting the first task, no EWC update.")
            self.n_switches += 1

        elif self._model.training and (task_id is None
                                       or task_id > self.previous_task):
            # we don't want to go here at test time.
            # NOTE: We also switch between unknown tasks.
            logger.info(f"Switching tasks: {self.previous_task} -> {task_id}: "
                        f"Updating the EWC 'anchor' weights.")
            self.previous_task = task_id
            device = self._model.config.device
            self.previous_model_weights = (PVector.from_model(
                self._shared_net.to(device)).clone().detach())

            # Create a Dataloader from the stored observations.
            obs_type: Type[Observations] = type(self.observation_collector[0])
            dataset = [
                obs.as_namedtuple() for obs in self.observation_collector
            ]
            # Or, alternatively (see the note below on why we don't use this):
            # stacked_observations: Observations = obs_type.stack(self.observation_collector)
            # dataset = TensorDataset(*stacked_observations.as_namedtuple())

            # NOTE: This is equivalent to just using the same batch size as during
            # training, as each Observations in the list is already a batch.
            # NOTE: We keep the same batch size here as during training because for
            # instance in RL, it would be weird to suddenly give some new batch size,
            # since the buffers would get cleared and re-created just for these forward
            # passes
            dataloader = DataLoader(dataset, batch_size=None, collate_fn=None)

            # Create the parameters to be passed to the FIM function. These may vary a
            # bit, depending on if we're being applied in a classification setting or in
            # a regression setting (not done yet)
            variant: str
            if isinstance(self._model.output_head, ClassificationHead):
                variant = "classif_logits"
                n_output = self._model.action_space.n

                def fim_function(*inputs) -> Tensor:
                    observations = obs_type(*inputs).to(self._model.device)
                    forward_pass: ForwardPass = self._model(observations)
                    actions = forward_pass.actions
                    return actions.logits

            elif isinstance(self._model.output_head, RegressionHead):
                # NOTE: This hasn't been tested yet.
                variant = "regression"
                n_output = flatdim(self._model.action_space)

                def fim_function(*inputs) -> Tensor:
                    observations = obs_type(*inputs).to(self._model.device)
                    forward_pass: ForwardPass = self._model(observations)
                    actions = forward_pass.actions
                    return actions.y_pred

            else:
                raise NotImplementedError("TODO")

            new_fim = FIM(
                model=self._shared_net,
                loader=dataloader,
                representation=self.options.fim_representation,
                n_output=n_output,
                variant=variant,
                function=fim_function,
                device=self._model.device,
            )

            # TODO: There was maybe an idea to use another fisher information matrix for
            # the critic in A2C, but not doing that atm.
            new_fims = [new_fim]
            self.consolidate(new_fims, task=self.previous_task)
            self.n_switches += 1
            self.observation_collector.clear()
Example #26
def dict_encoder(input_space: spaces.Dict,
                 output_space: Space,
                 budget: int = None,
                 shared_budget: int = None,
                 hidden_dims: int = 512,
                 split: Dict[str, Any] = None,
                 shared: Dict[str, Any] = None,
                 **kwargs) -> nn.Module:
    """ IDEA: Create an encoder for each item in the dict, mapping from the
    corresponding input space to some kind of latent space, and then add a
    flatten/concat layer, then map to the provided output space.
    
    shared_budget: The budget for the shared portion of the network. Must be
    less than the `budget`. If only `budget` is given, the shared budget is set
    to 1/2 of the total budget. 
    
    """
    split = split or {}
    shared = shared or {}
    if kwargs:
        warnings.warn(
            RuntimeWarning(
                f"Ignoring kwargs {kwargs}! (This acceps 'split' and 'shared' to "
                f"hold the hparams of the split and shared portions of the "
                f"encoder)."))

    total_input_dims = flatdim(input_space)
    total_output_dims = flatdim(output_space)
    n_inputs = len(input_space.spaces)

    if not isinstance(output_space, spaces.Box):
        raise NotImplementedError("Only support Box output spaces for now.")

    split_budget: Optional[int] = None
    # NOTE: don't re-declare `shared_budget` here; doing so would silently discard
    # the value passed as an argument.

    if budget is not None:
        if shared_budget is None:
            shared_budget = budget // 2
        split_budget = budget - shared_budget

    encoders: Dict[str, nn.Module] = {}
    latent_spaces: Dict[str, Space] = {}

    for key, subspace in input_space.spaces.items():
        dimension_input_dim = flatdim(subspace)
        # TODO: Each output will be a Box for now, and each dimension will have
        # a number of 'dedicated' features in the 'output space' that will be
        # proportional to their size in the input space.
        dimension_output_dim = round(dimension_input_dim / total_input_dims *
                                     hidden_dims)
        dimension_output_dim = max(dimension_output_dim, 1)
        dimension_output_dim = min(dimension_output_dim,
                                   total_output_dims - (n_inputs - 1))
        assert 0 < dimension_output_dim < total_output_dims

        dimension_latent_space: Space = spaces.Box(
            0, 1, shape=[dimension_output_dim])
        latent_spaces[key] = dimension_latent_space

        # The 'budget', in number of parameters, that gets allotted for the
        # encoding of this dimension.
        dimension_budget = None
        if split_budget is not None:
            # The dimension gets a portion of the budget based on the proportion
            # of its input space compared to the total.
            dimension_budget = round(dimension_input_dim / total_input_dims *
                                     split_budget)
        else:
            dimension_budget = None

        encoders[key] = create_encoder(subspace,
                                       output_space=dimension_latent_space,
                                       budget=dimension_budget,
                                       **split.get(key, {}))

    # Encoder that processes each input separately and produces a "latent space"
    # for each input dimension. (dict input, dict output).
    split_encoders_module = DictEncoder(encoders)
    actual_split_params = n_parameters(split_encoders_module)

    if split_budget is not None:
        if actual_split_params > split_budget:
            warnings.warn(
                RuntimeWarning(
                    f"The budget for the 'split' portion of the encoder was "
                    f"{split_budget} parameters, but somehow the constructed "
                    f"module has {actual_split_params} parameters!"))

    # Operation that 'concatenates' all the hidden spaces together.
    concat_layer = LambdaModule(
        f=lambda d: torch.cat(list(d.values()), dim=-1))
    latent_dims = sum(map(flatdim, latent_spaces.values()))
    fused_latent_space = spaces.Box(
        -np.inf,
        np.inf,
        shape=[latent_dims],
    )

    assert latent_dims == hidden_dims, "The sum of latent spaces didn't equal the prescribed hidden dims?"

    shared_module = create_encoder(
        fused_latent_space,
        output_space=output_space,
        budget=shared_budget,  # the shared portion gets its own slice of the total budget
        **shared,
    )
    return nn.Sequential(
        split_encoders_module,
        concat_layer,
        shared_module,
    )
Example #27
    def get_number_inputs(self):
        return flatdim(self.env.observation_space)
Example #28
    def get_number_outputs(self):
        return flatdim(self.env.action_space)
Example #29
def test_with_controllable_episode_lengths(batch_size: int, monkeypatch):
    """ TODO: Test out the PolicyHead in a very controlled environment, where we
    know exactly the lengths of each episode.
    """
    env = FakeEnvironment(
        partial(gym.make, "CartPole-v0"),
        batch_size=batch_size,
        episode_lengths=[5, *(10 for _ in range(batch_size - 1))],
        new_episode_length=lambda env_index: 10,
    )
    env = AddDoneToObservation(env)
    env = ConvertToFromTensors(env)
    env = EnvDataset(env)

    obs_space = env.single_observation_space
    x_dim = flatdim(obs_space["x"])
    # Create some dummy encoder.
    encoder = nn.Linear(x_dim, x_dim)
    representation_space = obs_space["x"]

    output_head = PolicyHead(
        input_space=representation_space,
        action_space=env.single_action_space,
        reward_space=env.single_reward_space,
        hparams=PolicyHead.HParams(
            max_episode_window_length=100,
            min_episodes_before_update=1,
            accumulate_losses_before_backward=False,
        ),
    )
    # TODO: Simulating as if the output head were attached to a BaselineModel.
    PolicyHead.base_model_optimizer = torch.optim.Adam(
        output_head.parameters(), lr=1e-3
    )

    # Simplify the loss function so we know exactly what the loss should be at
    # each step.

    def mock_policy_gradient(
        rewards: Sequence[float], log_probs: Sequence[float], gamma: float = 0.95
    ) -> Optional[Loss]:
        log_probs = (log_probs - log_probs.clone()) + 1
        # Return the length of the episode, but with a "gradient" flowing back into log_probs.
        return len(rewards) * log_probs.mean()

    monkeypatch.setattr(output_head, "policy_gradient", mock_policy_gradient)

    batch_size = env.batch_size

    obs = env.reset()
    step_done = np.zeros(batch_size, dtype=bool)  # np.bool is deprecated; plain bool works across numpy versions

    for step in range(200):
        x, obs_done = obs["x"], obs["done"]

        # The done from the obs should always be the same as the 'done' from the 'step' function.
        assert np.array_equal(obs_done, step_done)

        representations = encoder(x)
        observations = ContinualRLSetting.Observations(x=x, done=obs_done,)

        actions_obj = output_head(observations, representations)
        actions = actions_obj.y_pred

        # TODO: kinda useless to wrap a single tensor in an object..
        forward_pass = ForwardPass(
            observations=observations, representations=representations, actions=actions,
        )
        obs, rewards, step_done, info = env.step(actions)

        rewards_obj = ContinualRLSetting.Rewards(y=rewards)
        loss = output_head.get_loss(
            forward_pass=forward_pass, actions=actions_obj, rewards=rewards_obj,
        )
        print(f"Step {step}")
        print(f"num episodes since update: {output_head.num_episodes_since_update}")
        print(f"steps left in episode: {env.steps_left_in_episode}")
        print(f"Loss for that step: {loss}")

        if any(obs_done):
            assert loss != 0.0

        if step == 5.0:
            # Env 0 first episode from steps 0 -> 5
            assert loss.loss == 5.0
            assert loss.metrics["gradient_usage"].used_gradients == 5.0
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 10:
            # Envs[1:batch_size], first episode, from steps 0 -> 10
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 10.0 * (
                batch_size - 1
            )
            assert loss.metrics["gradient_usage"].wasted_gradients == 0.0
        elif step == 15:
            # Env 0 second episode from steps 5 -> 15
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step == 20:
            # Envs[1:batch_size]: second episode, from steps 10 -> 20
            # NOTE: At this point, both envs have reached the required number of episodes.
            # This means that the gradient usage on the next time any env reaches
            # an end-of-episode will be one less than the total number of items.
            assert loss.loss == 10.0 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step == 25:
            # Env 0 third episode from steps 15 -> 25
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        elif step > 0 and step % 10 == 0:
            # Same pattern as step 20 above
            assert loss.loss == 10.0 * (batch_size - 1), step
            assert loss.metrics["gradient_usage"].used_gradients == 9 * (batch_size - 1)
            assert loss.metrics["gradient_usage"].wasted_gradients == 1 * (
                batch_size - 1
            )

        elif step > 0 and step % 5 == 0:
            # Same pattern as step 25 above
            assert loss.loss == 10.0
            assert loss.metrics["gradient_usage"].used_gradients == 4
            assert loss.metrics["gradient_usage"].wasted_gradients == 6

        else:
            assert loss.loss == 0.0, step
Example #30
def test_flatdim(space, flatdim):
    dim = utils.flatdim(space)
    assert dim == flatdim, "Expected {} to equal {}".format(dim, flatdim)