Example #1
 def training_step(self, x=None, y=None, loss=None, retain_graph=False, global_net=None):
     '''
     Takes a single training step: one forward and one backward pass
     For most RL usage, we have custom, often complicated, loss functions. Compute the loss value, put it in a PyTorch tensor, and pass it in as loss
     '''
     self.train()
     self.zero_grad()
     self.optim.zero_grad()
     if loss is None:
         out = self(x)
         loss = self.loss_fn(out, y)
     assert not torch.isnan(loss).any(), loss
     if net_util.to_assert_trained():
         # to accommodate split model in inherited classes
         model = getattr(self, 'model', None) or getattr(self, 'model_body')
         assert_trained = net_util.gen_assert_trained(model)
     loss.backward(retain_graph=retain_graph)
     if self.clip_grad:
         logger.debug(f'Clipping gradient: {self.clip_grad_val}')
         torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
     if global_net is None:
         self.optim.step()
     else:  # distributed training with global net
         net_util.push_global_grad(self, global_net)
         self.optim.step()
         net_util.pull_global_param(self, global_net)
     if net_util.to_assert_trained():
         model = getattr(self, 'model', None) or getattr(self, 'model_body')
         assert_trained(model, loss)
     logger.debug(f'Net training_step loss: {loss}')
     return loss
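A minimal usage sketch of the method above; the names `agent`, `states`, `q_targets`, and `advs` are assumptions for illustration, not part of the example. In RL the caller usually builds the loss tensor itself, while the supervised path lets the net compute it from x and y.

    # hypothetical usage sketch for the training_step above
    policy_loss = -(torch.stack(agent.body.log_probs) * advs).mean()  # custom RL loss
    loss = agent.net.training_step(loss=policy_loss)
    # or let the net compute a supervised loss from inputs and targets
    loss = agent.net.training_step(x=states, y=q_targets)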
Example #2
    def train_shared(self):
        '''
        Trains the network when the actor and critic share parameters
        loss = self.policy_loss_coef * policy_loss + self.val_loss_coef * val_loss
        '''
        if self.to_train == 1:
            batch = self.sample()
            with torch.no_grad():
                advs, v_targets = self.calc_advs_v_targets(batch)
            policy_loss = self.calc_policy_loss(batch, advs)  # from actor
            val_loss = self.calc_val_loss(batch, v_targets)  # from critic
            loss = policy_loss + val_loss
            self.net.training_step(loss=loss,
                                   global_net=self.global_nets.get('net'))
            # reset
            self.to_train = 0
            self.body.entropies = []
            self.body.log_probs = []
            logger.debug(
                f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}'
            )

            return loss.item()
        else:
            return np.nan
Example #3
def test_logger(test_str):
    logger.critical(test_str)
    logger.debug(test_str)
    logger.error(test_str)
    logger.exception(test_str)
    logger.info(test_str)
    logger.warning(test_str)
Example #4
def calc_log_probs(algorithm, net, body, batch):
    '''
    Method to calculate log_probs fresh from batch data
    Body already stores log_prob from self.net. This is used for PPO, where log_probs need to be recalculated.
    '''
    states, actions = batch['states'], batch['actions']
    action_dim = body.action_dim
    is_multi_action = ps.is_iterable(action_dim)
    # construct log_probs for each state-action
    pdparams = algorithm.calc_pdparam(states, net=net)
    pdparams = guard_multi_pdparams(pdparams, body)
    assert len(pdparams) == len(states), f'batch_size of pdparams: {len(pdparams)} vs states: {len(states)}'

    pdtypes = ACTION_PDS[body.action_type]
    ActionPD = getattr(distributions, body.action_pdtype)

    log_probs = []
    for idx, pdparam in enumerate(pdparams):
        if not is_multi_action:  # already cloned for multi_action above
            pdparam = pdparam.clone()  # clone for grad safety
        _action, action_pd = sample_action_pd(ActionPD, pdparam, body)
        log_probs.append(action_pd.log_prob(actions[idx].float()).sum(dim=0))
    log_probs = torch.stack(log_probs)
    assert not torch.isnan(log_probs).any(), f'log_probs: {log_probs}, \npdparams: {pdparams} \nactions: {actions}'
    logger.debug(f'log_probs: {log_probs}')
    return log_probs
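These recomputed log_probs are what PPO compares against the log_probs stored at rollout time to form its probability ratio. A minimal sketch of that use, where `old_log_probs` and `clip_eps` are assumed names for the stored values and the clipping parameter:

    # hypothetical PPO clipped-surrogate sketch using the recomputed log_probs
    ratios = torch.exp(log_probs - old_log_probs.detach())
    surr1 = ratios * advs
    surr2 = torch.clamp(ratios, 1.0 - clip_eps, 1.0 + clip_eps) * advs
    policy_loss = -torch.min(surr1, surr2).mean()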
Example #5
def get_session_data(session):
    '''
    Gather data from session: MDP, Agent, Env data, hashed by aeb; then aggregate.
    @returns {dict, dict} session_mdp_data, session_data
    '''
    data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES
    mdp_data_names = ['t', 'epi'] + data_names
    agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys())
    data_h_v_dict = {data_name: session.aeb_space.get_history_v(data_name) for data_name in data_names}
    session_mdp_data, session_data = {}, {}
    for aeb in session.aeb_space.aeb_list:
        data_h_dict = {data_name: data_h_v[aeb] for data_name, data_h_v in data_h_v_dict.items()}
        # trim back to remove any incomplete sessions due to multienv termination
        complete_done_h = np.trim_zeros(data_h_dict['done'], 'b')
        # offset properly to bin separate episodes
        reset_bin = np.concatenate([[0.], complete_done_h[:-1]])
        data_len = len(reset_bin)
        reset_idx = reset_bin.astype('bool')
        nonreset_idx = ~reset_idx
        data_h_dict['t'] = np.ones(reset_idx.shape)
        data_h_dict['epi'] = reset_idx.astype(int).cumsum()
        mdp_df = pd.DataFrame({
            data_name: data_h_dict[data_name][:data_len]
            for data_name in mdp_data_names})
        mdp_df = mdp_df.reindex(mdp_data_names, axis=1)
        aeb_df = mdp_df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS)
        aeb_df.reset_index(drop=False, inplace=True)
        session_mdp_data[aeb], session_data[aeb] = mdp_df, aeb_df
    logger.debug(f'{session_data}')
    data_size_in_mb = util.memory_size(session_mdp_data)
    logger.debug(f'Size of session data: {data_size_in_mb} MB')
    if data_size_in_mb > 25:
        logger.warn('Session data > 25 MB')
    return session_mdp_data, session_data
Example #6
 def update_lr(self):
     assert 'lr' in self.optim_param
     old_lr = self.optim_param['lr']
     self.optim_param['lr'] = old_lr * 0.9
     logger.debug(
         f'Learning rate decayed from {old_lr} to {self.optim_param["lr"]}')
     self.optim = net_util.get_optim_multinet(self.params, self.optim_param)
Example #7
 def training_step(self,
                   xs=None,
                   ys=None,
                   loss=None,
                   retain_graph=False,
                   lr_clock=None):
     '''
     Takes a single training step: one forward and one backward pass. Both xs and ys are lists of the same length, one x and y per environment
     '''
     self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t'))
     self.train()
     self.optim.zero_grad()
     if loss is None:
         outs = self(xs)
         total_loss = torch.tensor(0.0, device=self.device)
         for out, y in zip(outs, ys):
             loss = self.loss_fn(out, y)
             total_loss += loss
         loss = total_loss
     assert not torch.isnan(loss).any(), loss
     if net_util.to_assert_trained():
         assert_trained = net_util.gen_assert_trained(self)
     loss.backward(retain_graph=retain_graph)
     if self.clip_grad_val is not None:
         nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
     self.optim.step()
     if net_util.to_assert_trained():
         assert_trained(self, loss)
         self.store_grad_norms()
     logger.debug(f'Net training_step loss: {loss}')
     return loss
Example #8
 def calc_gae_advs_v_targets(self, batch, v_preds):
     '''
     Calculate GAE, and advs = GAE, v_targets = advs + v_preds
     See GAE from Schulman et al. https://arxiv.org/pdf/1506.02438.pdf
     '''
     next_states = batch['next_states'][-1]
     if not self.body.env.is_venv:
         next_states = next_states.unsqueeze(dim=0)
     with torch.no_grad():
         next_v_pred = self.calc_v(next_states, use_cache=False)
     v_preds = v_preds.detach()  # adv does not accumulate grad
     if self.body.env.is_venv:
         v_preds = math_util.venv_pack(v_preds, self.body.env.num_envs)
         next_v_pred = next_v_pred.unsqueeze(dim=0)
     v_preds_all = torch.cat((v_preds, next_v_pred), dim=0)
     advs = math_util.calc_gaes(batch['rewards'], batch['dones'],
                                v_preds_all, self.gamma, self.lam)
     v_targets = advs + v_preds
     advs = math_util.standardize(
         advs)  # standardize only for advs, not v_targets
     if self.body.env.is_venv:
         advs = math_util.venv_unpack(advs)
         v_targets = math_util.venv_unpack(v_targets)
     logger.debug(f'advs: {advs}\nv_targets: {v_targets}')
     return advs, v_targets
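For reference, the quantities computed above follow the GAE paper cited in the docstring; the done mask that zeroes the bootstrap term is applied inside math_util.calc_gaes:

    \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
    \hat{A}_t^{GAE(\gamma,\lambda)} = \sum_{l=0}^{\infty} (\gamma\lambda)^l \, \delta_{t+l}
    V_t^{target} = \hat{A}_t + V(s_t)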
Example #9
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_iter):
             batch = self.sample()
             clock.set_batch_size(len(batch))
             for _ in range(self.training_batch_iter):
                 loss = self.calc_q_loss(batch)
                 self.net.train_step(loss,
                                     self.optim,
                                     self.lr_scheduler,
                                     clock=clock,
                                     global_net=self.global_net)
                 total_loss += loss
         loss = total_loss / (self.training_iter * self.training_batch_iter)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Example #10
    def train_shared(self):
        '''
        Trains the network when the actor and critic share parameters
        '''
        if self.to_train == 1:
            # update old net
            net_util.copy(self.net, self.old_net)
            batch = self.sample()
            total_loss = torch.tensor(0.0, device=self.net.device)
            for _ in range(self.training_epoch):
                with torch.no_grad():
                    advs, v_targets = self.calc_advs_v_targets(batch)
                policy_loss = self.calc_policy_loss(batch, advs)  # from actor
                val_loss = self.calc_val_loss(batch, v_targets)  # from critic
                loss = policy_loss + val_loss
                # retain for entropies etc.
                self.net.training_step(loss=loss, retain_graph=True, global_net=self.global_nets.get('net'))
                total_loss += loss
            loss = total_loss / self.training_epoch
            # reset
            self.to_train = 0
            self.body.entropies = []
            self.body.log_probs = []
            logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}')

            return loss.item()
        else:
            return np.nan
Example #11
    def calc_q_loss(self, batch):
        '''Compute the Q value loss using predicted and target Q values from the appropriate networks'''
        states = batch['states']
        next_states = batch['next_states']
        q_preds = self.net(states)
        with torch.no_grad():
            # Use online_net to select actions in next state
            online_next_q_preds = self.online_net(next_states)
            # Use eval_net to calculate next_q_preds for actions chosen by online_net
            next_q_preds = self.eval_net(next_states)
        act_q_preds = q_preds.gather(
            -1, batch['actions'].long().unsqueeze(-1)).squeeze(-1)
        online_actions = online_next_q_preds.argmax(dim=-1, keepdim=True)
        max_next_q_preds = next_q_preds.gather(-1, online_actions).squeeze(-1)
        max_q_targets = batch['rewards'] + self.gamma * (
            1 - batch['dones']) * max_next_q_preds
        logger.debug(
            f'act_q_preds: {act_q_preds}\nmax_q_targets: {max_q_targets}')
        q_loss = self.net.loss_fn(act_q_preds, max_q_targets)

        # TODO use the same loss_fn but do not reduce yet
        if 'Prioritized' in util.get_class_name(self.body.memory):  # PER
            errors = (max_q_targets - act_q_preds.detach()).abs().cpu().numpy()
            self.body.memory.update_priorities(errors)
        return q_loss
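In equation form, this is the double-DQN target, with action selection by online_net and evaluation by eval_net:

    a^* = \arg\max_{a'} Q_{online}(s', a')
    y = r + \gamma \, (1 - d) \, Q_{eval}(s', a^*)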
Example #13
 def space_train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         self.body.flush()
         return np.nan
     clock = self.body.env.clock  # main clock
     tick = util.s_get(self, 'aeb_space.clock').get(clock.max_tick_unit)
     self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0)
     if self.to_train == 1:
         total_loss = torch.tensor(0.0, device=self.net.device)
         for _ in range(self.training_epoch):
             batch = self.space_sample()
             for _ in range(self.training_batch_epoch):
                 loss = self.calc_q_loss(batch)
                 self.net.training_step(loss=loss, lr_clock=clock)
                 total_loss += loss
         loss = total_loss / (self.training_epoch * self.training_batch_epoch)
         # reset
         self.to_train = 0
         for body in self.agent.nanflat_body_a:
             body.flush()
         logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
Example #14
 def init_nets(self):
     '''Initialize nets with multi-task dimensions, and set net params'''
     # NOTE: Separate init from MultitaskDQN despite similarities so that this implementation can support arbitrary sized state and action heads (e.g. multiple layers)
     net_spec = self.agent.spec['net']
     if len(net_spec['hid_layers']) > 0:
         state_head_out_d = int(net_spec['hid_layers'][0] / 4)
     else:
         state_head_out_d = 16
     self.state_dims = [
         [body.state_dim, state_head_out_d] for body in self.agent.nanflat_body_a]
     self.action_dims = [
         [body.action_dim] for body in self.agent.nanflat_body_a]
     self.total_state_dim = sum([s[0] for s in self.state_dims])
     self.total_action_dim = sum([a[0] for a in self.action_dims])
     logger.debug(
         f'State dims: {self.state_dims}, total: {self.total_state_dim}')
     logger.debug(
         f'Action dims: {self.action_dims}, total: {self.total_action_dim}')
     net_kwargs = util.compact_dict(dict(
         hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
         optim_param=_.get(net_spec, 'optim'),
         loss_param=_.get(net_spec, 'loss'),
         clamp_grad=_.get(net_spec, 'clamp_grad'),
         clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
     ))
     self.net = getattr(net, net_spec['type'])(
         self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs)
     self.target_net = getattr(net, net_spec['type'])(
         self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs)
     self.online_net = self.target_net
     self.eval_net = self.target_net
     util.set_attr(self, _.pick(net_spec, [
         'batch_size', 'update_type', 'update_frequency', 'polyak_weight',
     ]))
Example #15
def test_logger(test_multiline_str):
    logger.critical(test_multiline_str)
    logger.debug(test_multiline_str)
    logger.error(test_multiline_str)
    logger.exception(test_multiline_str)
    logger.info(test_multiline_str)
    logger.warn(test_multiline_str)
Example #16
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
     Otherwise this function does nothing.
     '''
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     self.to_train = (total_t > self.training_min_timestep
                      and total_t % self.training_frequency == 0)
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             batch = self.sample()
             for _ in range(self.training_batch_epoch):
                 with torch.no_grad():
                     q_targets = self.calc_q_targets(batch)
                 loss = self.net.training_step(batch['states'], q_targets)
                 total_loss += loss
         loss = total_loss / (self.training_epoch *
                              self.training_batch_epoch)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #17
def get_session_data(session):
    '''Gather data from session: MDP, Agent, Env data, and form session_data.'''
    aeb_space = session.aeb_space
    data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES
    agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys())
    data_h_v_dict = {
        data_name: aeb_space.get_history_v(data_name)
        for data_name in data_names
    }
    session_df_data = {}
    session_data = {}
    for aeb in aeb_space.aeb_list:
        data_h_dict = {
            data_name: data_h_v[aeb]
            for data_name, data_h_v in data_h_v_dict.items()
        }
        reset_idx = np.isnan(data_h_dict['done'])
        nonreset_idx = ~reset_idx
        epi_h = reset_idx.astype(int).cumsum()
        t_h = np.ones(reset_idx.shape)
        data_h_dict['epi'] = epi_h
        data_h_dict['t'] = t_h
        df = pd.DataFrame({
            data_name: data_h_dict[data_name][nonreset_idx]
            for data_name in ['epi', 't'] + data_names
        })
        aeb_df = df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS)
        aeb_df.reset_index(drop=False, inplace=True)
        # TODO save full data to db
        session_df_data[aeb] = df
        session_data[aeb] = aeb_df
    logger.debug(f'{session_data}')
    return session_data
Example #18
def normalize_states_and_next_states(body, batch, episodic_flag=None):
    '''
    Convenience function for normalizing the states and next states in a batch of data
    '''
    logger.debug(f'states: {batch["states"]}')
    logger.debug(f'next states: {batch["next_states"]}')
    episodic = episodic_flag if episodic_flag is not None else body.memory.is_episodic
    logger.debug(
        f'Episodic: {episodic}, episodic_flag: {episodic_flag}, body.memory: {body.memory.is_episodic}'
    )
    if episodic:
        normalized = []
        for epi in batch['states']:
            normalized.append(normalize_state(body, epi))
        batch['states'] = normalized
        normalized = []
        for epi in batch['next_states']:
            normalized.append(normalize_state(body, epi))
        batch['next_states'] = normalized
    else:
        batch['states'] = normalize_state(body, batch['states'])
        batch['next_states'] = normalize_state(body, batch['next_states'])
    logger.debug(f'normalized states: {batch["states"]}')
    logger.debug(f'normalized next states: {batch["next_states"]}')
    return batch
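normalize_state itself is defined elsewhere in the module; a minimal z-score style sketch of what such a normalizer could look like, assuming it reads the running statistics maintained by update_online_stats in Example #33 and that numpy is imported as np (an illustration, not the repo's exact implementation):

    def normalize_state_sketch(body, state):
        # hypothetical: standardize with the running mean/std kept on body
        return (np.asarray(state) - body.state_mean) / (body.state_std_dev + 1e-8)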
Example #19
    def calc_gae_advs_v_targets(self, batch):
        '''
        Calculate the GAE advantages and value targets for training actor and critic respectively
        adv_targets = GAE (see math_util method)
        v_targets = adv_targets + v_preds
        before output, adv_targets is standardized (so v_targets uses the unstandardized version)
        Used for training with GAE
        '''
        v_preds = self.calc_v(batch['states'])
        # calc next_state boundary value and concat with above for efficiency
        next_v_pred_tail = self.calc_v(batch['next_states'][-1:])
        next_v_preds = torch.cat([v_preds[1:], next_v_pred_tail], dim=0)
        # ensure val for next_state is 0 at done
        next_v_preds = next_v_preds * (1 - batch['dones'])

        # v_targets = gae_targets + v_preds
        adv_targets = math_util.calc_gaes(batch['rewards'], v_preds,
                                          next_v_preds, self.gamma, self.lam)
        v_targets = adv_targets + v_preds
        if torch.cuda.is_available() and self.net.gpu:
            adv_targets = adv_targets.cuda()
            v_targets = v_targets.cuda()

        # standardization trick
        # guard nan std by setting to 0 and add small const
        adv_std = adv_targets.std()
        adv_std[adv_std != adv_std] = 0
        adv_std += 1e-08
        adv_targets = (adv_targets - adv_targets.mean()) / adv_std
        logger.debug(f'adv_targets: {adv_targets}\nv_targets: {v_targets}')
        return adv_targets, v_targets
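The standardization trick at the end is, with the NaN-guarded standard deviation (std is forced to 0 when it is NaN, e.g. for a single-element batch):

    \hat{A} = \frac{A - \operatorname{mean}(A)}{\operatorname{std}(A) + 10^{-8}}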
Example #20
    def train_separate(self):
        '''
        Trains the network when the actor and critic are separate networks
        '''
        clock = self.body.env.clock
        if self.to_train == 1:
            # onpolicy update
            super_loss = super(SIL, self).train_separate()
            # offpolicy sil update with random minibatch
            total_sil_loss = torch.tensor(0.0, device=self.net.device)
            for _ in range(self.training_epoch):
                batch = self.replay_sample()
                for _ in range(self.training_batch_epoch):
                    sil_policy_loss, sil_val_loss = self.calc_sil_policy_val_loss(
                        batch)
                    self.net.training_step(loss=sil_policy_loss,
                                           lr_clock=clock,
                                           retain_graph=True)
                    self.critic.training_step(loss=sil_val_loss,
                                              lr_clock=clock)
                    total_sil_loss += sil_policy_loss + sil_val_loss
            sil_loss = total_sil_loss / self.training_epoch
            loss = super_loss + sil_loss
            logger.debug(
                f'Trained {self.name} at epi: {clock.get("epi")}, total_t: {clock.get("total_t")}, t: {clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}'
            )

            return loss.item()
        else:
            return np.nan
Example #21
 def training_step(self,
                   x=None,
                   y=None,
                   loss=None,
                   retain_graph=False,
                   lr_clock=None):
     '''
     Takes a single training step: one forward and one backward pass
     For most RL usage, we have custom, often complicated, loss functions. Compute the loss value, put it in a PyTorch tensor, and pass it in as loss
     '''
     if hasattr(self, 'model_tails') and x is not None:
         raise ValueError(
             'Loss computation from x,y not supported for multitails')
     self.lr_scheduler.step(epoch=ps.get(lr_clock, 'total_t'))
     self.train()
     self.optim.zero_grad()
     if loss is None:
         out = self(x)
         loss = self.loss_fn(out, y)
     assert not torch.isnan(loss).any(), loss
     if net_util.to_assert_trained():
         assert_trained = net_util.gen_assert_trained(self)
     loss.backward(retain_graph=retain_graph)
     if self.clip_grad_val is not None:
         nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
     self.optim.step()
     if net_util.to_assert_trained():
         assert_trained(self, loss)
         self.store_grad_norms()
     logger.debug(f'Net training_step loss: {loss}')
     return loss
Example #22
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         clock.set_batch_size(len(batch))
         loss = self.calc_q_loss(batch)
         self.net.train_step(loss,
                             self.optim,
                             self.lr_scheduler,
                             clock=clock,
                             global_net=self.global_net)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Example #23
 def train(self):
     '''Train actor critic by computing the loss in batch efficiently'''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         clock.set_batch_size(len(batch))
         pdparams, v_preds = self.calc_pdparam_v(batch)
         advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
         policy_loss = self.calc_policy_loss(batch, pdparams, advs)  # from actor
         val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
         if self.shared:  # shared network
             loss = policy_loss + val_loss
             self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
         else:
             self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
             self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
             loss = policy_loss + val_loss
         # reset
         self.to_train = 0
         logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
Example #24
 def calc_q_targets(self, batch):
     '''Compute the target Q values for multitask network by iterating through the slices corresponding to bodies, and computing the singleton function'''
     q_preds = self.net.wrap_eval(batch['states'])
     # Use online_net to select actions in next state
     online_next_q_preds = self.online_net.wrap_eval(batch['next_states'])
     next_q_preds = self.eval_net.wrap_eval(batch['next_states'])
     start_idx = 0
     multi_q_targets = []
     # iterate over body, use slice with proper idx offset
     for b, body_batch in enumerate(batch['body_batches']):
         body = self.agent.nanflat_body_a[b]
         end_idx = start_idx + body.action_dim
         _, action_idxs = torch.max(online_next_q_preds[:,
                                                        start_idx:end_idx],
                                    dim=1)
         # Offset action index properly
         action_idxs += start_idx
         batch_size = len(body_batch['dones'])
         max_next_q_preds = next_q_preds[range(batch_size), action_idxs]
         max_q_targets = body_batch['rewards'] + self.gamma * (
             1 - body_batch['dones']) * max_next_q_preds
         max_q_targets.unsqueeze_(1)
         q_targets = (max_q_targets * body_batch['actions']) + (
             q_preds[:, start_idx:end_idx] * (1 - body_batch['actions']))
         multi_q_targets.append(q_targets)
         start_idx = end_idx
     q_targets = torch.cat(multi_q_targets, dim=1)
     logger.debug(f'q_targets: {q_targets}')
     return q_targets
Example #25
File: mlp.py Project: tttor/SLM-Lab
 def training_step(self, xs=None, ys=None, loss=None, retain_graph=False):
     '''
     Takes a single training step: one forward and one backward pass. Both xs and ys are lists of the same length, one x and y per environment
     '''
     self.train()
     self.zero_grad()
     self.optim.zero_grad()
     if loss is None:
         outs = self(xs)
         total_loss = torch.tensor(0.0)
         for out, y in zip(outs, ys):
             loss = self.loss_fn(out, y)
             total_loss += loss.cpu()
     else:  # a precomputed loss tensor was passed in
         total_loss = loss
     assert not torch.isnan(total_loss).any()
     if net_util.to_assert_trained():
         assert_trained = net_util.gen_assert_trained(self.model_body)
     total_loss.backward(retain_graph=retain_graph)
     if self.clip_grad:
         logger.debug(f'Clipping gradient')
         torch.nn.utils.clip_grad_norm_(self.parameters(),
                                        self.clip_grad_val)
     self.optim.step()
     if net_util.to_assert_trained():
         assert_trained(self.model_body)
     return total_loss
Example #26
 def train_separate(self):
     '''
     Trains the network when the actor and critic are separate networks
     '''
     if self.to_train == 1:
         batch = self.sample()
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             loss = self.calc_loss(batch)
             # to reuse loss for critic
             loss.backward = partial(loss.backward, retain_graph=True)
             self.net.training_step(loss=loss)
             # critic.optim.step using the same loss
             loss.backward = partial(loss.backward, retain_graph=False)
             self.critic.training_step(loss=loss)
             total_loss += loss
         loss = total_loss.mean()
         net_util.copy(self.net, self.old_net)
         net_util.copy(self.critic, self.old_critic)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Example #27
 def training_step(self,
                   x=None,
                   y=None,
                   loss=None,
                   retain_graph=False,
                   global_net=None):
     '''Takes a single training step: one forward and one backward pass'''
     self.train()
     self.zero_grad()
     self.optim.zero_grad()
     if loss is None:
         out = self(x)
         loss = self.loss_fn(out, y)
     assert not torch.isnan(loss).any(), loss
     if net_util.to_assert_trained():
         assert_trained = net_util.gen_assert_trained(self.rnn_model)
     loss.backward(retain_graph=retain_graph)
     if self.clip_grad:
         logger.debug(f'Clipping gradient: {self.clip_grad_val}')
         torch.nn.utils.clip_grad_norm_(self.parameters(),
                                        self.clip_grad_val)
     if global_net is None:
         self.optim.step()
     else:  # distributed training with global net
         net_util.push_global_grad(self, global_net)
         self.optim.step()
         net_util.pull_global_param(self, global_net)
     if net_util.to_assert_trained():
         assert_trained(self.rnn_model, loss)
     logger.debug(f'Net training_step loss: {loss}')
     return loss
Example #28
 def training_step(self, xs=None, ys=None, loss=None, retain_graph=False, global_net=None):
     '''
     Takes a single training step: one forward and one backward pass. Both xs and ys are lists of the same length, one x and y per environment
     '''
     self.train()
     self.zero_grad()
     self.optim.zero_grad()
     if loss is None:
         outs = self(xs)
         total_loss = torch.tensor(0.0, device=self.device)
         for out, y in zip(outs, ys):
             loss = self.loss_fn(out, y)
             total_loss += loss
         loss = total_loss
     assert not torch.isnan(loss).any(), loss
     if net_util.to_assert_trained():
         assert_trained = net_util.gen_assert_trained(self.model_body)
     loss.backward(retain_graph=retain_graph)
     if self.clip_grad:
         logger.debug(f'Clipping gradient: {self.clip_grad_val}')
         torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
     if global_net is None:
         self.optim.step()
     else:  # distributed training with global net
         net_util.push_global_grad(self, global_net)
         self.optim.step()
         net_util.pull_global_param(self, global_net)
     if net_util.to_assert_trained():
         assert_trained(self.model_body, loss)
     logger.debug(f'Net training_step loss: {loss}')
     return loss
Example #29
 def train_shared(self):
     '''
     Trains the network when the actor and critic share parameters
     '''
     clock = self.body.env.clock
     if self.to_train == 1:
         # update old net
         torch.cuda.empty_cache()
         net_util.copy(self.net, self.old_net)
         batch = self.sample()
         total_loss = torch.tensor(0.0, device=self.net.device)
         for _ in range(self.training_epoch):
             with torch.no_grad():
                 advs, v_targets = self.calc_advs_v_targets(batch)
             policy_loss = self.calc_policy_loss(batch, advs)  # from actor
             val_loss = self.calc_val_loss(batch, v_targets)  # from critic
             loss = policy_loss + val_loss
             # retain for entropies etc.
             self.net.training_step(loss=loss,
                                    lr_clock=clock,
                                    retain_graph=True)
             total_loss += loss
         loss = total_loss / self.training_epoch
         # reset
         self.to_train = 0
         self.body.flush()
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Example #30
 def compute_q_target_values(self, batch):
     '''Computes the target Q values for a batch of experiences. Note that the net references may differ based on the algorithm.'''
     q_sts = self.net.wrap_eval(batch['states'])
     # Use act_select network to select actions in next state
     q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
     _val, q_next_acts = torch.max(q_next_st_acts, dim=1)
     logger.debug(f'Q next action: {q_next_acts.size()}')
     # Select q_next_st_maxs based on action selected in q_next_acts
     # Evaluate the action selection using the eval net
     q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
     logger.debug(f'Q next_states: {q_next_sts.size()}')
     idx = torch.from_numpy(np.array(list(range(self.batch_size))))
     q_next_st_maxs = q_next_sts[idx, q_next_acts]
     q_next_st_maxs.unsqueeze_(1)
     logger.debug(f'Q next_states max {q_next_st_maxs.size()}')
     # Compute final q_target using reward and estimated best Q value from the next state if there is one
     # Make future reward 0 if the current state is done
     q_targets_max = batch['rewards'].data + self.gamma * \
         torch.mul((1 - batch['dones'].data), q_next_st_maxs)
     logger.debug(f'Q targets max: {q_targets_max.size()}')
     # We only want to train the network for the action selected
     # For all other actions we set the q_target = q_sts
     # So that the loss for these actions is 0
     q_targets = torch.mul(q_targets_max, batch['actions'].data) + \
         torch.mul(q_sts, (1 - batch['actions'].data))
     logger.debug(f'Q targets: {q_targets.size()}')
     return q_targets
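With one-hot encoded actions, the masking at the end keeps the target equal to the current prediction for every non-selected action, so those entries contribute zero loss:

    y_{max} = r + \gamma \, (1 - d) \, Q_{eval}\big(s', \arg\max_{a'} Q_{online}(s', a')\big)
    Q_{target} = y_{max} \odot a_{onehot} + Q(s) \odot (1 - a_{onehot})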
Example #31
 def train_shared(self):
     '''
     Trains the network when the actor and critic share parameters
     '''
     if self.to_train == 1:
         batch = self.sample()
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             with torch.no_grad():
                 advs, v_targets = self.calc_advs_v_targets(batch)
             policy_loss = self.calc_policy_loss(batch, advs)  # from actor
             val_loss = self.calc_val_loss(batch, v_targets)  # from critic
             loss = policy_loss + val_loss
             # retain for entropies etc.
             self.net.training_step(loss=loss, retain_graph=True)
             total_loss += loss.cpu()
         loss = total_loss / self.training_epoch
         net_util.copy(self.net, self.old_net)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Example #32
def get_session_data(session):
    '''
    Gather data from session: MDP, Agent, Env data, hashed by aeb; then aggregate.
    @returns {dict, dict} session_mdp_data, session_data
    '''
    data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES
    mdp_data_names = ['t', 'epi'] + data_names
    agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys())
    data_h_v_dict = {
        data_name: session.aeb_space.get_history_v(data_name)
        for data_name in data_names
    }
    session_mdp_data, session_data = {}, {}
    for aeb in session.aeb_space.aeb_list:
        data_h_dict = {
            data_name: data_h_v[aeb]
            for data_name, data_h_v in data_h_v_dict.items()
        }
        # remove any incomplete session timesteps from tail (due to multienv termination)
        complete_done_h = np.trim_zeros(data_h_dict['done'], 'b')
        data_len = len(complete_done_h)
        reset_idx = np.isnan(complete_done_h)
        nonreset_idx = ~reset_idx
        data_h_dict['t'] = np.ones(reset_idx.shape)
        data_h_dict['epi'] = reset_idx.astype(int).cumsum()
        mdp_df = pd.DataFrame({
            data_name: data_h_dict[data_name][:data_len][nonreset_idx]
            for data_name in mdp_data_names
        })
        mdp_df = mdp_df.reindex(mdp_data_names, axis=1)
        aeb_df = mdp_df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS)
        aeb_df.reset_index(drop=False, inplace=True)
        session_mdp_data[aeb], session_data[aeb] = mdp_df, aeb_df
    logger.debug(f'{session_data}')
    return session_mdp_data, session_data
Example #33
def update_online_stats(body, state):
    '''
    Method to calculate the running mean and standard deviation of the state space.
    See https://www.johndcook.com/blog/standard_deviation/ for more details
    for n >= 1
        M_n = M_n-1 + (state - M_n-1) / n
        S_n = S_n-1 + (state - M_n-1) * (state - M_n)
        variance = S_n / (n - 1)
        std_dev = sqrt(variance)
    '''
    logger.debug(f'mean: {body.state_mean}, std: {body.state_std_dev}, num examples: {body.state_n}')
    # Assumes only one state is given
    if ("Atari" in body.memory.__class__.__name__):
        assert state.ndim == 3
    elif getattr(body.memory, 'raw_state_dim', False):
        assert state.size == body.memory.raw_state_dim
    else:
        assert state.size == body.state_dim or state.shape == body.state_dim
    mean = body.state_mean
    body.state_n += 1
    if np.isnan(mean).any():
        assert np.isnan(body.state_std_dev_int)
        assert np.isnan(body.state_std_dev)
        body.state_mean = state
        body.state_std_dev_int = 0
        body.state_std_dev = 0
    else:
        assert body.state_n > 1
        body.state_mean = mean + (state - mean) / body.state_n
        body.state_std_dev_int = body.state_std_dev_int + (state - mean) * (state - body.state_mean)
        body.state_std_dev = np.sqrt(body.state_std_dev_int / (body.state_n - 1))
        # Guard against very small std devs
        if (body.state_std_dev < 1e-8).any():
            body.state_std_dev[np.where(body.state_std_dev < 1e-8)] += 1e-8
    logger.debug(f'new mean: {body.state_mean}, new std: {body.state_std_dev}, num examples: {body.state_n}')
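A standalone sanity check of the running-statistics recurrence from the docstring against batch statistics (an illustrative sketch, independent of body):

    import numpy as np

    xs = np.array([1.0, 4.0, 2.0, 8.0])
    mean, s, n = 0.0, 0.0, 0
    for x in xs:
        n += 1
        old_mean = mean
        mean = old_mean + (x - old_mean) / n      # M_n
        s = s + (x - old_mean) * (x - mean)       # S_n
    assert np.isclose(mean, xs.mean())
    assert np.isclose(np.sqrt(s / (n - 1)), xs.std(ddof=1))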
Example #34
 def reset(self, state_space):
     logger.debug('AgentSpace.reset')
     _action_v, _loss_v, _explore_var_v = self.aeb_space.init_data_v(AGENT_DATA_NAMES)
     for agent in self.agents:
         state_a = state_space.get(a=agent.a)
         agent.reset(state_a)
     _action_space, _loss_space, _explore_var_space = self.aeb_space.add(AGENT_DATA_NAMES, [_action_v, _loss_v, _explore_var_v])
     return _action_space
Example #35
 def reset(self):
     logger.debug('EnvSpace.reset')
     _reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
     for env in self.envs:
         _reward_e, state_e, done_e = env.reset()
         state_v[env.e, 0:len(state_e)] = state_e
         done_v[env.e, 0:len(done_e)] = done_e
     _reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, [_reward_v, state_v, done_v])
     logger.debug(f'\nstate_space: {state_space}')
     return _reward_space, state_space, done_space
Example #36
 def calc_val_loss(self, batch, v_targets):
     '''Calculate the critic's value loss'''
     v_targets = v_targets.unsqueeze(dim=-1)
     v_preds = self.calc_v(batch['states'], evaluate=False).unsqueeze_(dim=-1)
     assert v_preds.shape == v_targets.shape
     val_loss = self.val_loss_coef * self.net.loss_fn(v_preds, v_targets)
     if torch.cuda.is_available() and self.net.gpu:
         val_loss = val_loss.cuda()
     logger.debug(f'Critic value loss: {val_loss:.2f}')
     return val_loss
Example #37
 def act(self, state_space):
     data_names = ['action']
     action_v, = self.aeb_space.init_data_v(data_names)
     for agent in self.agents:
         a = agent.a
         state_a = state_space.get(a=a)
         action_a = agent.act(state_a)
         action_v[a, 0:len(action_a)] = action_a
     action_space, = self.aeb_space.add(data_names, [action_v])
     logger.debug(f'\naction_space: {action_space}')
     return action_space
Example #38
 def step(self, action_space):
     reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
     for env in self.envs:
         e = env.e
         action_e = action_space.get(e=e)
         reward_e, state_e, done_e = env.step(action_e)
         reward_v[e, 0:len(reward_e)] = reward_e
         state_v[e, 0:len(state_e)] = state_e
         done_v[e, 0:len(done_e)] = done_e
     reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, [reward_v, state_v, done_v])
     logger.debug(f'\nreward_space: {reward_space}\nstate_space: {state_space}\ndone_space: {done_space}')
     return reward_space, state_space, done_space
Example #39
 def calc_policy_loss(self, batch, advs):
     '''Calculate the actor's policy loss'''
     assert len(self.body.log_probs) == len(advs), f'{len(self.body.log_probs)} vs {len(advs)}'
     log_probs = torch.stack(self.body.log_probs)
     policy_loss = - self.policy_loss_coef * log_probs * advs
     if self.add_entropy:
         entropies = torch.stack(self.body.entropies)
         policy_loss += (-self.entropy_coef * entropies)
     policy_loss = torch.mean(policy_loss)
     if torch.cuda.is_available() and self.net.gpu:
         policy_loss = policy_loss.cuda()
     logger.debug(f'Actor policy loss: {policy_loss:.2f}')
     return policy_loss
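In equation form, with c_pol = self.policy_loss_coef and c_ent = self.entropy_coef (the entropy term applies only when add_entropy is set):

    L_{policy} = \mathbb{E}_t\big[ -c_{pol} \, \log \pi_\theta(a_t \mid s_t) \, \hat{A}_t - c_{ent} \, H(\pi_\theta(\cdot \mid s_t)) \big]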
Example #40
 def train(self):
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     if self.to_train == 1:
         batch = self.sample()
         loss = self.calc_policy_loss(batch)
         self.net.training_step(loss=loss)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Policy loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #41
 def update_nets(self):
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if self.net.update_type == 'replace':
         if total_t % self.net.update_frequency == 0:
             logger.debug('Updating target_net by replacing')
             self.target_net.load_state_dict(self.net.state_dict())
             self.online_net = self.target_net
             self.eval_net = self.target_net
     elif self.net.update_type == 'polyak':
         logger.debug('Updating net by averaging')
         net_util.polyak_update(self.net, self.target_net, self.net.polyak_coef)
         self.online_net = self.target_net
         self.eval_net = self.target_net
     else:
         raise ValueError('Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
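net_util.polyak_update is not shown here; it performs the usual soft update, where the exact role of polyak_coef (here beta) is an assumption about the helper's convention:

    \theta_{target} \leftarrow \beta \, \theta_{target} + (1 - \beta) \, \theta_{net}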
Example #42
 def update(self, action_space, reward_space, state_space, done_space):
     data_names = ['loss', 'explore_var']
     loss_v, explore_var_v = self.aeb_space.init_data_v(data_names)
     for agent in self.agents:
         a = agent.a
         action_a = action_space.get(a=a)
         reward_a = reward_space.get(a=a)
         state_a = state_space.get(a=a)
         done_a = done_space.get(a=a)
         loss_a, explore_var_a = agent.update(action_a, reward_a, state_a, done_a)
         loss_v[a, 0:len(loss_a)] = loss_a
         explore_var_v[a, 0:len(explore_var_a)] = explore_var_a
     loss_space, explore_var_space = self.aeb_space.add(data_names, [loss_v, explore_var_v])
     logger.debug(f'\nloss_space: {loss_space}\nexplore_var_space: {explore_var_space}')
     return loss_space, explore_var_space
Example #43
def fn_decay_lr(net, fn):
    '''
    Decay the learning rate for the net module; this only returns the new lr, for the user to set on the appropriate nets
    In the future, might add more flexible lr adjustment, like boosting and decaying on need.
    '''
    space_clock = util.s_get(net.algorithm, 'aeb_space.clock')
    total_t = space_clock.get('total_t')
    start_val, end_val = net.optim_spec['lr'], 1e-6
    anneal_total_t = net.lr_anneal_timestep or max(10e6, 60 * net.lr_decay_frequency)

    if total_t >= net.lr_decay_min_timestep and total_t % net.lr_decay_frequency == 0:
        logger.debug(f'anneal_total_t: {anneal_total_t}, total_t: {total_t}')
        new_lr = fn(start_val, end_val, anneal_total_t, total_t)
        return new_lr
    else:
        return no_decay(net)
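fn is any annealing schedule with the signature used above; a hypothetical linear-decay example (not the repo's implementation) could look like:

    def linear_decay(start_val, end_val, anneal_total_t, total_t):
        # hypothetical schedule: interpolate from start_val to end_val over anneal_total_t steps
        frac = min(total_t / anneal_total_t, 1.0)
        return start_val + frac * (end_val - start_val)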
Example #44
 def train_separate(self):
     '''
     Trains the network when the actor and critic are separate networks
     loss = val_loss + abs(policy_loss)
     '''
     if self.to_train == 1:
         batch = self.sample()
         policy_loss = self.train_actor(batch)
         val_loss = self.train_critic(batch)
         loss = val_loss + abs(policy_loss)
         # reset
         self.to_train = 0
         self.body.entropies = []
         self.body.log_probs = []
         logger.debug(f'Total loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Example #45
 def train_separate(self):
     '''
     Trains the network when the actor and critic are separate networks
     '''
     if self.to_train == 1:
         batch = self.sample()
         policy_loss = self.train_actor(batch)
         val_loss = self.train_critic(batch)
         loss = val_loss + abs(policy_loss)
         net_util.copy(self.net, self.old_net)
         net_util.copy(self.critic, self.old_critic)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Example #46
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     Otherwise this function does nothing.
     '''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     if self.to_train == 1:
         batch = self.sample()
         with torch.no_grad():
             q_targets = self.calc_q_targets(batch)
         loss = self.net.training_step(batch['states'], q_targets)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #47
 def training_step(self, x=None, y=None, loss=None, retain_graph=False):
     '''Takes a single training step: one forward and one backward pass'''
     self.train()
     self.zero_grad()
     self.optim.zero_grad()
     if loss is None:
         out = self(x)
         loss = self.loss_fn(out, y)
     assert not torch.isnan(loss).any()
     if net_util.to_assert_trained():
         assert_trained = net_util.gen_assert_trained(self.conv_model)
     loss.backward(retain_graph=retain_graph)
     if self.clip_grad:
         logger.debug(f'Clipping gradient')
         torch.nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
     self.optim.step()
     if net_util.to_assert_trained():
         assert_trained(self.conv_model)
     return loss
Example #48
 def train_shared(self):
     '''
     Trains the network when the actor and critic share parameters
     loss = self.policy_loss_coef * policy_loss + self.val_loss_coef * val_loss
     '''
     if self.to_train == 1:
         batch = self.sample()
         with torch.no_grad():
             advs, v_targets = self.calc_advs_v_targets(batch)
         policy_loss = self.calc_policy_loss(batch, advs)  # from actor
         val_loss = self.calc_val_loss(batch, v_targets)  # from critic
         loss = policy_loss + val_loss
         self.net.training_step(loss=loss)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Total loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Example #49
    def calc_sil_policy_val_loss(self, batch):
        '''
        Calculate the SIL policy losses for actor and critic
        sil_policy_loss = -log_prob * max(R - v_pred, 0)
        sil_val_loss = (max(R - v_pred, 0)^2) / 2
        This is called on a randomly sampled batch from experience replay
        '''
        returns = math_util.calc_returns(batch, self.gamma)
        v_preds = self.calc_v(batch['states'], evaluate=False)
        clipped_advs = torch.clamp(returns - v_preds, min=0.0)
        log_probs = self.calc_log_probs(batch)

        sil_policy_loss = self.sil_policy_loss_coef * torch.mean(- log_probs * clipped_advs)
        sil_val_loss = self.sil_val_loss_coef * torch.pow(clipped_advs, 2) / 2
        sil_val_loss = torch.mean(sil_val_loss)

        if torch.cuda.is_available() and self.net.gpu:
            sil_policy_loss = sil_policy_loss.cuda()
            sil_val_loss = sil_val_loss.cuda()
        logger.debug(f'SIL actor policy loss: {sil_policy_loss:.2f}')
        logger.debug(f'SIL critic value loss: {sil_val_loss:.2f}')
        return sil_policy_loss, sil_val_loss
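In equation form, with c_pol = self.sil_policy_loss_coef and c_val = self.sil_val_loss_coef, this matches the self-imitation losses in the docstring:

    L_{SIL}^{policy} = c_{pol} \, \mathbb{E}\big[ -\log \pi_\theta(a \mid s) \, \max(R - V_\theta(s), 0) \big]
    L_{SIL}^{value} = c_{val} \, \mathbb{E}\big[ \tfrac{1}{2} \max(R - V_\theta(s), 0)^2 \big]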
Example #50
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
     Otherwise this function does nothing.
     '''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
     is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             batch = self.sample()
             for _ in range(self.training_batch_epoch):
                 with torch.no_grad():
                     q_targets = self.calc_q_targets(batch)
                     if is_per:
                         q_preds = self.net.wrap_eval(batch['states'])
                         errors = torch.abs(q_targets - q_preds)
                         errors = errors.sum(dim=1).unsqueeze_(dim=1)
                         for body in self.agent.nanflat_body_a:
                             body.memory.update_priorities(errors)
                 loss = self.net.training_step(batch['states'], q_targets)
                 total_loss += loss.cpu()
         loss = total_loss / (self.training_epoch * self.training_batch_epoch)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #51
 def time_fn(*args, **kwargs):
     start = time.time()
     output = fn(*args, **kwargs)
     end = time.time()
     logger.debug(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms')
     return output
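time_fn is the inner wrapper of a timing decorator (fn comes from the enclosing scope). A minimal sketch of the surrounding decorator under that assumption, reusing the module-level logger seen throughout these examples; the decorator name is chosen for illustration:

    import functools
    import time

    def lab_timer(fn):
        @functools.wraps(fn)
        def time_fn(*args, **kwargs):
            start = time.time()
            output = fn(*args, **kwargs)
            end = time.time()
            logger.debug(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms')
            return output
        return time_fn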
Example #52
 def check_api(*args, **kwargs):
     # TODO name-based data check for api methods
     output = fn(*args, **kwargs)
     logger.debug(f'API method: {fn.__name__}, output: {output}')
     return output
Example #53
 def __init__(self):
     logger.debug('Monitor initialized.')