Example #1
 def train(self):
     '''Completes one training step for the agent if it is time to train.
        i.e. the environment timestep is greater than the minimum training
        timestep and a multiple of the training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
           For each of the batches, the target Q values (q_targets) are computed and
           a single training step is taken k times
        Otherwise this function does nothing.
     '''
     t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if (t > self.training_min_timestep and t % self.training_frequency == 0):
         logger.debug(f'Training at t: {t}')
         nanflat_loss_a = np.zeros(self.agent.body_num)
         for _b in range(self.training_epoch):
             batch_losses = np.zeros(self.agent.body_num)
             batch = self.sample()
             for _i in range(self.training_iters_per_batch):
                 q_targets = self.compute_q_target_values(batch)
                 y = [Variable(q) for q in q_targets]
                 losses = self.net.training_step(batch['states'], y)
                 logger.debug(f'losses {losses}')
                 batch_losses += losses
             batch_losses /= self.training_iters_per_batch
             nanflat_loss_a += batch_losses
         nanflat_loss_a /= self.training_epoch
         loss_a = self.nanflat_to_data_a('loss', nanflat_loss_a)
         return loss_a
     else:
         logger.debug('NOT training')
         return np.nan
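
The docstring above describes the gating rule most of these examples share: train only once the environment timestep exceeds the minimum training timestep and lands on a multiple of the training frequency. A minimal, self-contained sketch of that predicate (generic names, not the library's attributes):

    def is_time_to_train(t, training_min_timestep, training_frequency):
        # Train only after warm-up and only every training_frequency steps.
        return t > training_min_timestep and t % training_frequency == 0

    assert not is_time_to_train(t=50, training_min_timestep=100, training_frequency=4)
    assert not is_time_to_train(t=102, training_min_timestep=100, training_frequency=4)
    assert is_time_to_train(t=104, training_min_timestep=100, training_frequency=4)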
Example #2
 def space_train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         self.body.flush()
         return np.nan
     clock = self.body.env.clock  # main clock
     tick = util.s_get(self, 'aeb_space.clock').get(clock.max_tick_unit)
     self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0)
     if self.to_train == 1:
         total_loss = torch.tensor(0.0, device=self.net.device)
         for _ in range(self.training_epoch):
             batch = self.space_sample()
             for _ in range(self.training_batch_epoch):
                 loss = self.calc_q_loss(batch)
                 self.net.training_step(loss=loss, lr_clock=clock)
                 total_loss += loss
         loss = total_loss / (self.training_epoch * self.training_batch_epoch)
         # reset
         self.to_train = 0
         for body in self.agent.nanflat_body_a:
             body.flush()
         logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
Example #3
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
     Otherwise this function does nothing.
     '''
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     self.to_train = (total_t > self.training_min_timestep
                      and total_t % self.training_frequency == 0)
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             batch = self.sample()
             for _ in range(self.training_batch_epoch):
                 with torch.no_grad():
                     q_targets = self.calc_q_targets(batch)
                 loss = self.net.training_step(batch['states'], q_targets)
                 total_loss += loss
         loss = total_loss / (self.training_epoch *
                              self.training_batch_epoch)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
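
The target computation above is wrapped in torch.no_grad() so that calc_q_targets does not record an autograd graph; only the subsequent training step on the online network's predictions is backpropagated. A small, runnable illustration of that behaviour:

    import torch

    w = torch.randn(4, 2, requires_grad=True)
    x = torch.randn(3, 4)
    with torch.no_grad():
        q_targets = x @ w       # computed without recording a graph
    q_preds = x @ w             # recorded normally
    print(q_targets.requires_grad)  # False
    print(q_preds.requires_grad)    # True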
Example #4
 def update_explore_var(self):
     '''Updates the explore variables'''
     space_clock = util.s_get(self, 'aeb_space.clock')
     nanflat_explore_var_a = self.action_policy_update(self, space_clock)
     explore_var_a = self.nanflat_to_data_a(
         'explore_var', nanflat_explore_var_a)
     return explore_var_a
Example #5
 def train(self):
     '''Completes one training step for the agent if it is time to train.
        i.e. the environment timestep is greater than the minimum training
        timestep and a multiple of the training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
           For each of the batches, the target Q values (q_targets) are computed and
           a single training step is taken k times
        Otherwise this function does nothing.
     '''
     t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if (t > self.training_min_timestep and t % self.training_frequency == 0):
         logger.debug(f'Training at t: {t}')
         total_loss = 0.0
         for _b in range(self.training_epoch):
             batch = self.sample()
             batch_loss = 0.0
             for _i in range(self.training_iters_per_batch):
                 q_targets = self.compute_q_target_values(batch)
                 y = Variable(q_targets)
                 loss = self.net.training_step(batch['states'], y)
                 batch_loss += loss.data[0]
             batch_loss /= self.training_iters_per_batch
             total_loss += batch_loss
         total_loss /= self.training_epoch
         logger.debug(f'total_loss {total_loss}')
         return total_loss
     else:
         logger.debug('NOT training')
         return np.nan
Example #6
 def update(self):
     '''Update the agent after training'''
     space_clock = util.s_get(self, 'aeb_space.clock')
     for net in [self.net]:
         net.update_lr(space_clock)
     explore_vars = [self.action_policy_update(self, body) for body in self.agent.nanflat_body_a]
     explore_var_a = self.nanflat_to_data_a('explore_var', explore_vars)
     return explore_var_a
Example #7
 def train(self):
     '''Completes one training step for the agent if it is time to train.
        i.e. the environment timestep is greater than the minimum training
        timestep and a multiple of the training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
           For each of the batches, the target Q values (q_targets) are computed and
           a single training step is taken k times
        Otherwise this function does nothing.
     '''
     t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if (t > self.training_min_timestep
             and t % self.training_frequency == 0):
         logger.debug(f'Training at t: {t}')
         total_loss = 0.0
         total_losses = None
         for _b in range(self.training_epoch):
             batch = self.sample()
             batch_loss = 0.0
             batch_losses = None
             for _i in range(self.training_iters_per_batch):
                 q_targets = self.compute_q_target_values(batch)
                 y = []
                 for q in q_targets:
                     y.append(Variable(q))
                 loss, losses = self.net.training_step(batch['states'], y)
                 logger.debug(f'loss {loss}')
                 logger.debug(f'losses {losses}')
                 batch_loss += loss
                 if batch_losses is None:
                     batch_losses = losses
                 else:
                     batch_losses = [
                         sum(x) for x in zip(batch_losses, losses)
                     ]
             batch_loss /= self.training_iters_per_batch
             batch_losses = [
                 float(x) / self.training_iters_per_batch
                 for x in batch_losses
             ]
             total_loss += batch_loss
             if total_losses is None:
                 total_losses = batch_losses
             else:
                 total_losses = [
                     sum(x) for x in zip(total_losses, batch_losses)
                 ]
         total_loss /= self.training_epoch
         total_losses = [
             float(x) / self.training_epoch for x in total_losses
         ]
         if t % 25 == 0:
             logger.info(f'total_loss {total_loss}')
             logger.info(f'total losses {total_losses}')
         # TODO: Return other losses as well.
         return total_loss
     else:
         logger.debug('NOT training')
         return np.nan
Example #8
 def update_nets(self):
     res = super(DoubleDQN, self).update_nets()
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if self.net.update_type == 'replace':
         if total_t % self.net.update_frequency == 0:
             self.online_net = self.net
             self.eval_net = self.target_net
     elif self.net.update_type == 'polyak':
         self.online_net = self.net
         self.eval_net = self.target_net
Example #9
def decay_learning_rate(algo, nets):
    '''
    Decay learning rate for each net by the decay method update_lr() defined in them.
    In the future, might add more flexible lr adjustment, like boosting and decaying on need.
    '''
    space_clock = util.s_get(algo, 'aeb_space.clock')
    t = space_clock.get('total_t')
    if algo.decay_lr and t > algo.decay_lr_min_timestep:
        if t % algo.decay_lr_frequency == 0:
            for net in nets:
                net.update_lr()
Example #10
 def update(self):
     super(DoubleDQN, self).update()
     space_clock = util.s_get(self, 'aeb_space.clock')
     t = space_clock.get('t')
     if self.update_type == 'replace':
         if t % self.update_frequency == 0:
             self.online_net = self.net
             self.eval_net = self.target_net
     elif self.update_type == 'polyak':
         self.online_net = self.net
         self.eval_net = self.target_net
     return self.explore_var
Example #11
 def update(self):
     space_clock = util.s_get(self, 'aeb_space.clock')
     nets = [self.net
             ] if self.share_architecture else [self.net, self.critic]
     for net in nets:
         net.update_lr(space_clock)
     explore_vars = [
         self.action_policy_update(self, body)
         for body in self.agent.nanflat_body_a
     ]
     explore_var_a = self.nanflat_to_data_a('explore_var', explore_vars)
     return explore_var_a
Example #12
 def update_nets(self):
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if self.net.update_type == 'replace':
         if total_t % self.net.update_frequency == 0:
             logger.debug('Updating target_net by replacing')
             self.target_net.load_state_dict(self.net.state_dict())
             self.online_net = self.target_net
             self.eval_net = self.target_net
     elif self.net.update_type == 'polyak':
         logger.debug('Updating net by averaging')
         net_util.polyak_update(self.net, self.target_net, self.net.polyak_coef)
         self.online_net = self.target_net
         self.eval_net = self.target_net
     else:
         raise ValueError('Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
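
The 'polyak' branch above delegates to net_util.polyak_update with self.net.polyak_coef. As a rough sketch only, and assuming the coefficient weights the existing target parameters (check the library's own convention before relying on it), a soft target update can be written as:

    import torch

    def polyak_update_sketch(online_net, target_net, coef):
        # Assumed convention: theta_target <- coef * theta_target + (1 - coef) * theta_online
        with torch.no_grad():
            for src, dst in zip(online_net.parameters(), target_net.parameters()):
                dst.mul_(coef).add_(src, alpha=1 - coef)

    online, target = torch.nn.Linear(4, 2), torch.nn.Linear(4, 2)
    polyak_update_sketch(online, target, coef=0.995)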
Example #13
def fn_decay_lr(net, fn):
    '''
    Decay learning rate for net module, only returns the new lr for user to set to appropriate nets
    In the future, might add more flexible lr adjustment, like boosting and decaying on need.
    '''
    space_clock = util.s_get(net.algorithm, 'aeb_space.clock')
    total_t = space_clock.get('total_t')
    start_val, end_val = net.optim_spec['lr'], 1e-6
    anneal_total_t = net.lr_anneal_timestep or max(10e6, 60 * net.lr_decay_frequency)

    if total_t >= net.lr_decay_min_timestep and total_t % net.lr_decay_frequency == 0:
        logger.debug(f'anneal_total_t: {anneal_total_t}, total_t: {total_t}')
        new_lr = fn(start_val, end_val, anneal_total_t, total_t)
        return new_lr
    else:
        return no_decay(net)
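
Note that fn_decay_lr only computes and returns the new learning rate; the schedule itself is the fn argument, called as fn(start_val, end_val, anneal_total_t, total_t). A hypothetical linear anneal with that signature (an illustration, not the library's own decay function):

    def linear_anneal(start_val, end_val, anneal_total_t, total_t):
        # Hypothetical schedule: interpolate from start_val to end_val over anneal_total_t steps.
        frac = min(total_t / anneal_total_t, 1.0)
        return start_val + frac * (end_val - start_val)

    assert linear_anneal(0.01, 1e-6, 10000, 0) == 0.01
    assert abs(linear_anneal(0.01, 1e-6, 10000, 10000) - 1e-6) < 1e-12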
Example #14
    def space_train(self):
        '''
        Completes one training step for the agent if it is time to train.
        i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
        For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
        Otherwise this function does nothing.
        '''
        if util.get_lab_mode() == 'enjoy':
            return np.nan
        total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
        self.to_train = (total_t > self.training_min_timestep
                         and total_t % self.training_frequency == 0)
        is_per = util.get_class_name(
            self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
        if self.to_train == 1:
            total_loss = torch.tensor(0.0, device=self.net.device)
            for _ in range(self.training_epoch):
                batch = self.space_sample()
                for _ in range(self.training_batch_epoch):
                    with torch.no_grad():
                        q_targets = self.calc_q_targets(batch)
                        if is_per:
                            q_preds = self.net.wrap_eval(batch['states'])
                            errors = torch.abs(q_targets - q_preds)
                            errors = errors.sum(dim=1).unsqueeze_(dim=1)
                            for body in self.agent.nanflat_body_a:
                                body.memory.update_priorities(errors)
                    loss = self.net.training_step(
                        batch['states'],
                        q_targets,
                        global_net=self.global_nets.get('net'))
                    total_loss += loss
            loss = total_loss / (self.training_epoch *
                                 self.training_batch_epoch)
            # reset
            self.to_train = 0
            for body in self.agent.nanflat_body_a:
                body.entropies = []
                body.log_probs = []
            logger.debug(
                f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}'
            )

            return loss.item()
        else:
            return np.nan
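
In the prioritized-replay branch above, the error tensor is reduced from per-action absolute errors to one priority per transition. A quick shape check with a toy batch of 3 states and 2 actions:

    import torch

    q_targets = torch.tensor([[1.0, 2.0], [0.5, 0.5], [3.0, 1.0]])
    q_preds = torch.tensor([[1.5, 2.0], [0.0, 1.0], [2.0, 1.0]])
    errors = torch.abs(q_targets - q_preds).sum(dim=1).unsqueeze(dim=1)
    print(errors.shape)                # torch.Size([3, 1]): one priority per transition
    print(errors.squeeze(1).tolist())  # [0.5, 1.0, 1.0]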
Example #15
def fn_decay_lr(net, fn):
    '''
    Decay learning rate for net module, only returns the new lr for user to set to appropriate nets
    In the future, might add more flexible lr adjustment, like boosting and decaying on need.
    '''
    space_clock = util.s_get(net.algorithm, 'aeb_space.clock')
    total_t = space_clock.get('total_t')
    start_val, end_val = net.optim_spec['lr'], 1e-6
    anneal_total_t = net.lr_anneal_timestep or max(10e6,
                                                   60 * net.lr_decay_frequency)

    if total_t >= net.lr_decay_min_timestep and total_t % net.lr_decay_frequency == 0:
        logger.debug(f'anneal_total_t: {anneal_total_t}, total_t: {total_t}')
        new_lr = fn(start_val, end_val, anneal_total_t, total_t)
        return new_lr
    else:
        return no_decay(net)
Example #16
 def update_nets(self):
     # NOTE: Once polyak updating for multi-headed networks is supported via updates to flatten_params and load_params then this can be removed
     space_clock = util.s_get(self, 'aeb_space.clock')
     t = space_clock.get('t')
     if self.update_type == 'replace':
         if t % self.update_frequency == 0:
             logger.debug('Updating target_net by replacing')
             self.target_net = deepcopy(self.net)
             self.online_net = self.target_net
             self.eval_net = self.target_net
     elif self.update_type == 'polyak':
         logger.error(
             '"polyak" updating not supported yet for MultiHeadDQN, please use "replace" instead. Exiting.')
         sys.exit()
     else:
         logger.error(
             'Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
         sys.exit()
Example #17
 def update_nets(self):
     space_clock = util.s_get(self, 'aeb_space.clock')
     t = space_clock.get('t')
     if self.update_type == 'replace':
         if t % self.update_frequency == 0:
             logger.debug('Updating target_net by replacing')
             self.target_net = deepcopy(self.net)
             self.online_net = self.target_net
             self.eval_net = self.target_net
     elif self.update_type == 'polyak':
         logger.debug('Updating net by averaging')
         avg_params = self.polyak_weight * net_util.flatten_params(self.target_net) + \
             (1 - self.polyak_weight) * net_util.flatten_params(self.net)
         self.target_net = net_util.load_params(self.target_net, avg_params)
         self.online_net = self.target_net
         self.eval_net = self.target_net
     else:
         logger.error(
             'Unknown net.update_type. Should be "replace" or "polyak". Exiting.')
         sys.exit()
Example #18
 def train(self):
     '''Completes one training step for the agent if it is time to train.
        Otherwise this function does nothing.
     '''
     t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if self.to_train == 1:
         logger.debug3(f'Training at t: {t}')
         batch = self.sample()
         if batch['states'].size(0) < 2:
             logger.info(f'Batch too small to train with, skipping...')
             self.to_train = 0
             return np.nan
         q_targets = self.compute_q_target_values(batch)
         if torch.cuda.is_available() and self.gpu:
             q_targets = q_targets.cuda()
         y = Variable(q_targets)
         loss = self.net.training_step(batch['states'], y)
         logger.debug(f'loss {loss.data[0]}')
         self.to_train = 0
         return loss.data[0]
     else:
         logger.debug3('NOT training')
         return np.nan
Example #19
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times
     Otherwise this function does nothing.
     '''
     if util.get_lab_mode() == 'enjoy':
         return np.nan
     total_t = util.s_get(self, 'aeb_space.clock').get('total_t')
     self.to_train = (total_t > self.training_min_timestep and total_t % self.training_frequency == 0)
     is_per = util.get_class_name(self.agent.nanflat_body_a[0].memory) == 'PrioritizedReplay'
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             batch = self.sample()
             for _ in range(self.training_batch_epoch):
                 with torch.no_grad():
                     q_targets = self.calc_q_targets(batch)
                     if is_per:
                         q_preds = self.net.wrap_eval(batch['states'])
                         errors = torch.abs(q_targets - q_preds)
                         errors = errors.sum(dim=1).unsqueeze_(dim=1)
                         for body in self.agent.nanflat_body_a:
                             body.memory.update_priorities(errors)
                 loss = self.net.training_step(batch['states'], q_targets)
                 total_loss += loss.cpu()
         loss = total_loss / (self.training_epoch * self.training_batch_epoch)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss}')
         self.last_loss = loss.item()
     return self.last_loss
Example #20
def test_s_get(test_agent):
    spec = util.s_get(test_agent, 'aeb_space.spec')
    assert _.is_dict(spec)
    spec = util.s_get(test_agent, 'aeb_space').spec
    assert _.is_dict(spec)
Example #21
def test_s_get(test_agent):
    spec = util.s_get(test_agent, 'aeb_space.spec')
    assert ps.is_dict(spec)
    spec = util.s_get(test_agent, 'aeb_space').spec
    assert ps.is_dict(spec)
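
Both test versions assert the same property: following the full dotted path 'aeb_space.spec' returns the same object as resolving 'aeb_space' first and reading .spec off the result. A self-contained illustration with stub objects (attribute names are placeholders; the real fixtures are built by the lab from a spec file, and the real util.s_get also locates the enclosing space from the calling object):

    class StubAEBSpace:
        spec = {'name': 'stub_spec'}

    class StubSpace:
        aeb_space = StubAEBSpace()

    def dotted_get(obj, attr_path):
        # Follow a dotted attribute path such as 'aeb_space.spec' starting from obj.
        for name in attr_path.split('.'):
            obj = getattr(obj, name)
        return obj

    space = StubSpace()
    assert dotted_get(space, 'aeb_space.spec') is dotted_get(space, 'aeb_space').spec
    assert isinstance(dotted_get(space, 'aeb_space.spec'), dict)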
Example #22
 def update(self):
     '''Updates the explore variables'''
     space_clock = util.s_get(self, 'aeb_space.clock')
     self.action_policy_update(self, space_clock)
     return self.explore_var
Example #23
 def update_explore_var(self):
     space_clock = util.s_get(self, 'aeb_space.clock')
     self.action_policy_update(self, space_clock)