Example 1
def normalize_state(body, state):
    '''
    Normalizes one or more states using a running mean and standard deviation
    Details of the normalization from Deep RL Bootcamp, L6
    https://www.youtube.com/watch?v=8EcdaCk9KaQ&feature=youtu.be
    '''
    same_shape = False if isinstance(
        state, list) else state.shape == body.state_mean.shape
    has_preprocess = getattr(body.memory, 'preprocess_state', False)
    if ('Atari' in util.get_class_name(body.memory)):
        # never normalize atari, it has its own normalization step
        logger.debug(
            'skipping normalizing for Atari, already handled by preprocess')
        return state
    elif ('Replay' in util.get_class_name(body.memory)) and has_preprocess:
        # normalization handled by preprocess_state function in the memory
        logger.debug('skipping normalizing, already handled by preprocess')
        return state
    elif same_shape:
        # if not atari, always normalize the state the first time we see it during act
        # if the shape is not transformed in some way
        if np.sum(body.state_std_dev) == 0:
            return np.clip(state - body.state_mean, -10, 10)
        else:
            return np.clip((state - body.state_mean) / body.state_std_dev, -10,
                           10)
    else:
        # broadcastable sample from an un-normalized memory so we should normalize
        logger.debug('normalizing sample from memory')
        if np.sum(body.state_std_dev) == 0:
            return np.clip(state - body.state_mean, -10, 10)
        else:
            return np.clip((state - body.state_mean) / body.state_std_dev, -10,
                           10)
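
The normalization above is just a clipped z-score against the running state statistics, with a mean-centering fallback while the running std dev is still zero. A minimal standalone sketch of that transform, using hypothetical mean/std arrays in place of the body attributes:

import numpy as np

def normalize_with_stats(state, state_mean, state_std_dev, clip=10):
    # clipped z-score; fall back to mean-centering while the std dev is still all zeros
    if np.sum(state_std_dev) == 0:
        return np.clip(state - state_mean, -clip, clip)
    return np.clip((state - state_mean) / state_std_dev, -clip, clip)

state = np.array([0.2, -1.5, 3.0])
state_mean = np.array([0.0, -1.0, 2.5])
state_std_dev = np.array([0.5, 0.5, 0.5])
print(normalize_with_stats(state, state_mean, state_std_dev))  # [ 0.4 -1.   1. ]
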
Example 2
    def calc_q_loss(self, batch):
        '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
        states = batch['states']
        next_states = batch['next_states']
        q_preds = self.net(states)
        with torch.no_grad():
            # Use online_net to select actions in next state
            online_next_q_preds = self.online_net(next_states)
            # Use eval_net to calculate next_q_preds for actions chosen by online_net
            next_q_preds = self.eval_net(next_states)
        act_q_preds = q_preds.gather(
            -1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
        online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True)
        max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1)
        max_q_targets = batch['rewards'] + self.gamma * (
            1 - batch['dones']) * max_next_q_preds
        logger.debug(
            f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}')
        q_loss = self.net.loss_fn(act_q_preds, max_q_targets)

        # TODO use the same loss_fn but do not reduce yet
        if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
            errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy()
            self.body.memory.update_priorities(errors)
        return q_loss
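
The comments above describe the Double DQN split: the online net chooses the greedy next action, and the eval net provides the value for that action. A minimal sketch of just the target computation, with random tensors standing in for the two networks' outputs:

import torch

torch.manual_seed(0)
batch_size, num_actions, gamma = 4, 3, 0.99
rewards = torch.rand(batch_size)
dones = torch.zeros(batch_size)
online_next_q_preds = torch.rand(batch_size, num_actions)  # stand-in for online_net(next_states)
next_q_preds = torch.rand(batch_size, num_actions)         # stand-in for eval_net(next_states)

# online net selects the greedy action, eval net scores that action
online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True)
max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1)
max_q_targets = rewards + gamma * (1 - dones) * max_next_q_preds
print(max_q_targets.shape)  # torch.Size([4])
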
Example 3
def init_global_nets(algorithm):
    '''
    Initialize global_nets for Hogwild using an identical instance of an algorithm from an isolated Session
    In spec.meta.distributed, specify either:
    - 'shared': global network parameters are shared at all times. In this mode, the algorithm's local net is replaced directly by global_net by overriding the identically named attribute
    - 'synced': global network parameters are periodically synced to the local network after each gradient push. In this mode, the algorithm keeps a separate reference to `global_{net}` for each of its networks
    '''
    dist_mode = algorithm.agent.spec['meta']['distributed']
    assert dist_mode in ('shared', 'synced'), f'Unrecognized distributed mode: {dist_mode}'
    global_nets = {}
    for net_name in algorithm.net_names:
        optim_name = net_name.replace('net', 'optim')
        if not hasattr(
                algorithm,
                optim_name):  # only for trainable network, i.e. has an optim
            continue
        g_net = getattr(algorithm, net_name)
        g_net.share_memory()  # make net global
        if dist_mode == 'shared':  # use the same name to override the local net
            global_nets[net_name] = g_net
        else:  # keep a separate reference for syncing
            global_nets[f'global_{net_name}'] = g_net
        # if optim is Global, set to override the local optim and its scheduler
        optim = getattr(algorithm, optim_name)
        if 'Global' in util.get_class_name(optim):
            optim.share_memory()  # make optim global
            global_nets[optim_name] = optim
            lr_scheduler_name = net_name.replace('net', 'lr_scheduler')
            lr_scheduler = getattr(algorithm, lr_scheduler_name)
            global_nets[lr_scheduler_name] = lr_scheduler
    logger.info(
        f'Initialized global_nets attr {list(global_nets.keys())} for Hogwild')
    return global_nets
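
The key call here is share_memory(), which moves a module's parameter storage into shared memory so that Hogwild worker processes update the same tensors. A small sketch of that call and of the kind of dict the function builds, using a toy net and assuming 'shared' mode:

import torch.nn as nn

net = nn.Linear(4, 2)
net.share_memory()  # parameters now live in shared memory
print(all(p.is_shared() for p in net.parameters()))  # True

dist_mode = 'shared'
global_nets = {}
# 'shared' reuses the attribute name so the local net is overridden;
# 'synced' would store it under 'global_net' and sync periodically
key = 'net' if dist_mode == 'shared' else 'global_net'
global_nets[key] = net
print(list(global_nets.keys()))  # ['net']
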
Example 4
    def index_lab_comp(self, lab_comp):
        '''
        Update info space coor when initializing lab component, and return its coor and index.
        Does not apply to AEB entities.
        @returns {tuple, int} data_coor, index
        @example

        class Session:
            def __init__(self, spec):
                self.coor, self.index = info_space.index_lab_comp(self)
        '''
        axis = util.get_class_name(lab_comp, lower=True)
        self.advance_coor(axis)
        coor = self.coor.copy()
        index = coor[axis]
        return coor, index
Example 5
    def calc_q_loss(self, batch):
        '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
        q_preds = self.net.wrap_eval(batch['states'])
        act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
        next_q_preds = self.net.wrap_eval(batch['next_states'])
        # Bellman equation: compute max_q_targets using reward and max estimated Q values (0 if no next_state)
        max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=False)
        max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds
        max_q_targets = max_q_targets.detach()
        q_loss = self.net.loss_fn(act_q_preds, max_q_targets)

        # TODO use the same loss_fn but do not reduce yet
        if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
            errors = torch.abs(max_q_targets - act_q_preds.detach())
            self.body.memory.update_priorities(errors)
        return q_loss
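
For a single transition, the Bellman target above reduces to simple arithmetic. A worked example, assuming gamma = 0.99, reward = 1.0, done = 0, and hypothetical next-state Q estimates:

import torch

gamma = 0.99
reward, done = 1.0, 0.0
next_q_preds = torch.tensor([0.2, 0.5, 0.1])  # Q(s', a) for each action

max_next_q_pred = next_q_preds.max()                        # 0.5
max_q_target = reward + gamma * (1 - done) * max_next_q_pred
print(round(max_q_target.item(), 4))  # 1.495
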
Example 6
    def space_train(self):
        '''
        Completes one training step for the agent if it is time to train,
        i.e. the environment timestep is greater than the minimum training timestep and is a multiple of training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
        For each batch, the target Q values (q_targets) are computed and a single training step is taken k times.
        Otherwise this function does nothing.
        '''
        if util.get_lab_mode() == 'enjoy':
            return np.nan
        total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
        self.to_train = (total_t > self.training_min_timestep
                         and total_t % self.training_frequency == 0)
        is_per = util.get_class_name(
            self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
        if self.to_train == 1:
            total_loss = torch.tensor(0.0, device=self.net.device)
            for _ in range(self.training_epoch):
                batch = self.space_sample()
                for _ in range(self.training_batch_epoch):
                    with torch.no_grad():
                        q_targets = self.calc_q_targets(batch)
                        if is_per:
                            q_preds = self.net.wrap_eval(batch['states'])
                            errors = torch.abs(q_targets - q_preds)
                            errors = errors.sum(dim=1).unsqueeze_(dim=1)
                            for body in self.agent.nanflat_body_a:
                                body.memory.update_priorities(errors)
                    loss = self.net.training_step(
                        batch['states'],
                        q_targets,
                        global_net=self.global_nets.get('net'))
                    total_loss += loss
            loss = total_loss / (self.training_epoch *
                                 self.training_batch_epoch)
            # reset
            self.to_train = 0
            for body in self.agent.nanflat_body_a:
                body.entropies = []
                body.log_probs = []
            logger.debug(
                f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}'
            )

            return loss.item()
        else:
            return np.nan
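
The docstring's training condition is just a predicate on the environment clock. A quick sketch of when to_train fires, with hypothetical values for training_min_timestep and training_frequency:

training_min_timestep = 10
training_frequency = 4

def should_train(total_t):
    # train only after the warm-up period, and only every training_frequency-th step
    return total_t > training_min_timestep and total_t % training_frequency == 0

print([t for t in range(1, 25) if should_train(t)])  # [12, 16, 20, 24]
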
Example 7
def init_params(module, init_fn):
    '''Initialize module's weights using init_fn, and biases to 0.0'''
    bias_init = 0.0
    classname = util.get_class_name(module)
    if 'Net' in classname:  # skip if it's a net, not pytorch layer
        pass
    elif any(k in classname for k in ('BatchNorm', 'Conv', 'Linear')):
        init_fn(module.weight)
        nn.init.constant_(module.bias, bias_init)
    elif 'GRU' in classname:
        for name, param in module.named_parameters():
            if 'weight' in name:
                init_fn(param)
            elif 'bias' in name:
                nn.init.constant_(param, bias_init)
    else:
        pass
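
init_params is designed to be passed to nn.Module.apply, which visits every submodule. A minimal, self-contained sketch of the same pattern, using isinstance checks instead of the repo's util.get_class_name helper:

import torch.nn as nn

def init_layer(module, init_fn=nn.init.orthogonal_, bias_init=0.0):
    # initialize weight matrices and zero the biases of Linear/Conv layers only
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        init_fn(module.weight)
        nn.init.constant_(module.bias, bias_init)

net = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2))
net.apply(init_layer)  # apply() recurses over all submodules
print(net[0].bias.abs().sum().item())  # 0.0
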
Example 8
    def calc_q_loss(self, batch):
        '''Compute the Q value loss for the Hydra network by applying the singleton logic on the generalized aggregate.'''
        q_preds = torch.stack(self.net.wrap_eval(batch['states']))
        act_q_preds = q_preds.gather(-1, torch.stack(batch['actions']).long().unsqueeze(-1)).squeeze(-1)
        # Use online_net to select actions in next state
        online_next_q_preds = torch.stack(self.online_net.wrap_eval(batch['next_states']))
        # Use eval_net to calculate next_q_preds for actions chosen by online_net
        next_q_preds = torch.stack(self.eval_net.wrap_eval(batch['next_states']))
        max_next_q_preds = next_q_preds.gather(-1, online_next_q_preds.argmax(dim=-1, keepdim=True)).squeeze(-1)
        max_q_targets = torch.stack(batch['rewards']) + self.gamma * (1 - torch.stack(batch['dones'])) * max_next_q_preds
        max_q_targets = max_q_targets.detach()
        q_loss = self.net.loss_fn(act_q_preds, max_q_targets)

        # TODO use the same loss_fn but do not reduce yet
        for body in self.agent.nanflat_body_a:
            if 'Prioritized' in util.get_class_name(body.memory):  # PER
                errors = torch.abs(max_q_targets - act_q_preds.detach())
                body.memory.update_priorities(errors)
        return q_loss
Example 9
    def calc_q_loss(self, batch):
        '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
        q_preds = self.net.wrap_eval(batch['states'])
        act_q_preds = q_preds.gather(-1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
        # Use online_net to select actions in next state
        online_next_q_preds = self.online_net.wrap_eval(batch['next_states'])
        # Use eval_net to calculate next_q_preds for actions chosen by online_net
        next_q_preds = self.eval_net.wrap_eval(batch['next_states'])
        max_next_q_preds = next_q_preds.gather(-1, online_next_q_preds.argmax(dim=-1, keepdim=True)).squeeze(-1)
        max_q_targets = batch['rewards'] + self.gamma * (1 - batch['dones']) * max_next_q_preds
        max_q_targets = max_q_targets.detach()
        q_loss = self.net.loss_fn(act_q_preds, max_q_targets)

        # TODO use the same loss_fn but do not reduce yet
        if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
            errors = torch.abs(max_q_targets - act_q_preds.detach())
            self.body.memory.update_priorities(errors)
        return q_loss
Example 10
    def get_coor_idx(self, lab_comp):
        '''
        Get info space coor when initializing lab component, and return its coor and index.
        Does not apply to AEB entities.
        @returns {tuple, int} data_coor, index
        @example

        class Session:
            def __init__(self, spec):
                self.coor, self.index = info_space.get_coor_idx(self)

        info_space.tick('session')
        session = Session(spec, info_space)
        '''
        axis = util.get_class_name(lab_comp, lower=True)
        coor = self.coor.copy()
        index = coor[axis]
        return coor, index
Example 11
    def get_coor_idx(self, lab_comp):
        '''
        Get info space coor when initializing lab component, and return its coor and index.
        Does not apply to AEB entities.
        @returns {tuple, int} data_coor, index
        @example

        class Session:
            def __init__(self, spec):
                self.coor, self.index = info_space.get_coor_idx(self)

        info_space.tick('session')
        session = Session(spec, info_space)
        '''
        axis = util.get_class_name(lab_comp, lower=True)
        coor = self.coor.copy()
        index = coor[axis]
        return coor, index
Example 12
def update_online_stats(body, state):
    '''
    Method to calculate the running mean and standard deviation of the state space.
    See https://www.johndcook.com/blog/standard_deviation/ for more details
    for n >= 1
        M_n = M_n-1 + (state - M_n-1) / n
        S_n = S_n-1 + (state - M_n-1) * (state - M_n)
        variance = S_n / (n - 1)
        std_dev = sqrt(variance)
    '''
    logger.debug(
        f'mean: {body.state_mean}, std: {body.state_std_dev}, num examples: {body.state_n}'
    )
    # Assumes only one state is given
    if ('Atari' in util.get_class_name(body.memory)):
        assert state.ndim == 3
    elif getattr(body.memory, 'raw_state_dim', False):
        assert state.size == body.memory.raw_state_dim
    else:
        assert state.size == body.state_dim or state.shape == body.state_dim
    mean = body.state_mean
    body.state_n += 1
    if np.isnan(mean).any():
        assert np.isnan(body.state_std_dev_int)
        assert np.isnan(body.state_std_dev)
        body.state_mean = state
        body.state_std_dev_int = 0
        body.state_std_dev = 0
    else:
        assert body.state_n > 1
        body.state_mean = mean + (state - mean) / body.state_n
        body.state_std_dev_int = body.state_std_dev_int + (state - mean) * (
            state - body.state_mean)
        body.state_std_dev = np.sqrt(body.state_std_dev_int /
                                     (body.state_n - 1))
        # Guard against very small std devs
        if (body.state_std_dev < 1e-8).any():
            body.state_std_dev[np.where(body.state_std_dev < 1e-8)] += 1e-8
    logger.debug(
        f'new mean: {body.state_mean}, new std: {body.state_std_dev}, num examples: {body.state_n}'
    )
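
The update rule in the docstring is Welford's online algorithm. A standalone sketch that applies it to a short stream of scalars and checks the result against NumPy's batch statistics:

import numpy as np

def welford_update(mean, m2, n, x):
    # one step of Welford's online mean/variance update
    n += 1
    new_mean = mean + (x - mean) / n
    m2 = m2 + (x - mean) * (x - new_mean)
    return new_mean, m2, n

mean, m2, n = 0.0, 0.0, 0
data = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
for x in data:
    mean, m2, n = welford_update(mean, m2, n, x)

std_dev = np.sqrt(m2 / (n - 1))  # sample std dev, as in update_online_stats
print(mean, std_dev)                        # 5.0 2.138...
print(np.mean(data), np.std(data, ddof=1))  # matches the running estimates
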
Example 13
def init_parameters(module, init_fn):
    '''
    Initializes module's weights using init_fn, an initialization function such as one from nn.init
    Initializes module's biases to 0.0
    Applies only to BatchNorm, GRU, Linear and Conv layers (Net wrapper classes are skipped)
    '''
    bias_init = 0.0
    classname = util.get_class_name(module)
    if 'BatchNorm' in classname:
        init_fn(module.weight)
        nn.init.constant_(module.bias, bias_init)
    elif 'GRU' in classname:
        for name, param in module.named_parameters():
            if 'weight' in name:
                init_fn(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0.0)
    elif 'Linear' in classname or ('Conv' in classname
                                   and 'Net' not in classname):
        init_fn(module.weight)
        nn.init.constant_(module.bias, bias_init)
Example 14
    def calc_q_loss(self, batch):
        '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
        states = batch['states']
        next_states = batch['next_states']
        q_preds = self.net(states)
        with torch.no_grad():
            next_q_preds = self.net(next_states)
        act_q_preds = q_preds.gather(
            -1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
        # Bellman equation: compute max_q_targets using reward and max estimated Q values (0 if no next_state)
        max_next_q_preds, _ = next_q_preds.max(dim=-1, keepdim=False)
        max_q_targets = batch['rewards'] + self.gamma * (
            1 - batch['dones']) * max_next_q_preds
        logger.debug(
            f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}')
        q_loss = self.net.loss_fn(act_q_preds, max_q_targets)

        # TODO use the same loss_fn but do not reduce yet
        if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
            errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy()
            self.body.memory.update_priorities(errors)
        return q_loss
Example 15
 def train(self):
     '''
     Completes one training step for the agent if it is time to train,
     i.e. the environment timestep is greater than the minimum training timestep and is a multiple of training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each batch, the target Q values (q_targets) are computed and a single training step is taken k times.
     Otherwise this function does nothing.
     '''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
     is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             batch = self.sample()
             for _ in range(self.training_batch_epoch):
                 with torch.no_grad():
                     q_targets = self.calc_q_targets(batch)
                     if is_per:
                         q_preds = self.net.wrap_eval(batch['states'])
                         errors = torch.abs(q_targets - q_preds)
                         errors = errors.sum(dim=1).unsqueeze_(dim=1)
                         for body in self.agent.nanflat_body_a:
                             body.memory.update_priorities(errors)
                 loss = self.net.training_step(batch['states'], q_targets)
                 total_loss += loss.cpu()
         loss = total_loss / (self.training_epoch * self.training_batch_epoch)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example 16
 def is_discrete(self, a):
     '''Check if an agent (brain) is subject to discrete actions'''
     assert a == 0, 'OpenAI Gym supports only single body, use a=0'
     return util.get_class_name(self.action_spaces[a]) != 'Box'  # continuous
Example 17
 def try_update_per(self, q_preds, q_targets):
     if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
         with torch.no_grad():
             errors = (q_preds - q_targets).abs().cpu().numpy()
         self.body.memory.update_priorities(errors)
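
For context, update_priorities typically converts these absolute TD errors into sampling priorities. A toy stand-in (not the repo's memory class) that mimics the proportional PER scheme priority = (|error| + epsilon) ** alpha:

import numpy as np

class ToyPrioritizedMemory:
    '''Hypothetical stand-in that only tracks priorities for the last sampled batch'''
    def __init__(self, epsilon=0.01, alpha=0.6):
        self.epsilon, self.alpha = epsilon, alpha
        self.priorities = None

    def update_priorities(self, errors):
        # proportional prioritization: larger TD error -> higher sampling priority
        self.priorities = (np.abs(errors) + self.epsilon) ** self.alpha

memory = ToyPrioritizedMemory()
memory.update_priorities(np.array([0.0, 0.5, 2.0]))
print(memory.priorities)  # priorities increase monotonically with |error|
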
Example 18
 def _is_discrete(self, action_space):
     '''Check if an action space is discrete'''
     return util.get_class_name(action_space) != 'Box'
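
Both checks rely on Gym's convention that continuous action spaces are gym.spaces.Box instances, while everything else (Discrete, MultiDiscrete, ...) is treated as discrete. A quick sketch of the same class-name test on bare space objects, assuming the gym package is installed:

import numpy as np
from gym import spaces

discrete_space = spaces.Discrete(2)
box_space = spaces.Box(low=-2.0, high=2.0, shape=(1,), dtype=np.float32)

# anything whose class is not Box is treated as discrete
for space in (discrete_space, box_space):
    print(type(space).__name__, type(space).__name__ != 'Box')
# Discrete True
# Box False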