Code Example #1
 def train_shared(self):
     '''
     Trains the network when the actor and critic share parameters
     '''
     clock = self.body.env.clock
     if self.to_train == 1:
         # update old net
         torch.cuda.empty_cache()
         net_util.copy(self.net, self.old_net)
         batch = self.sample()
         total_loss = torch.tensor(0.0, device=self.net.device)
         for _ in range(self.training_epoch):
             with torch.no_grad():
                 advs, v_targets = self.calc_advs_v_targets(batch)
             policy_loss = self.calc_policy_loss(batch, advs)  # from actor
             val_loss = self.calc_val_loss(batch, v_targets)  # from critic
             loss = policy_loss + val_loss
             # retain for entropies etc.
             self.net.training_step(loss=loss,
                                    lr_clock=clock,
                                    retain_graph=True)
             total_loss += loss
         loss = total_loss / self.training_epoch
         # reset
         self.to_train = 0
         self.body.flush()
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
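Every PPO variant in this listing refreshes old_net from net via net_util.copy before computing the update. The helper itself is not shown in these excerpts; a minimal sketch consistent with how it is called (source first, target second, same architecture), and not necessarily SLM-Lab's exact implementation, is:

import torch.nn as nn

def copy(src_net: nn.Module, tar_net: nn.Module) -> None:
    # Overwrite the target module's parameters and buffers with the source's.
    # Assumes both modules were built from the same net spec.
    tar_net.load_state_dict(src_net.state_dict())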
Code Example #2
File: ppo.py Project: xiangshengcn/SLM-Lab
    def train_shared(self):
        '''
        Trains the network when the actor and critic share parameters
        '''
        if self.to_train == 1:
            # update old net
            net_util.copy(self.net, self.old_net)
            batch = self.sample()
            total_loss = torch.tensor(0.0, device=self.net.device)
            for _ in range(self.training_epoch):
                with torch.no_grad():
                    advs, v_targets = self.calc_advs_v_targets(batch)
                policy_loss = self.calc_policy_loss(batch, advs)  # from actor
                val_loss = self.calc_val_loss(batch, v_targets)  # from critic
                loss = policy_loss + val_loss
                # retain for entropies etc.
                self.net.training_step(loss=loss, retain_graph=True, global_net=self.global_nets.get('net'))
                total_loss += loss
            loss = total_loss / self.training_epoch
            # reset
            self.to_train = 0
            self.body.entropies = []
            self.body.log_probs = []
            logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}')

            return loss.item()
        else:
            return np.nan
Code Example #3
File: ppo.py Project: vhcg77/SLM-Lab
 def train_separate(self):
     '''
     Trains the networks when the actor and critic do not share parameters
     '''
     if self.to_train == 1:
         batch = self.sample()
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             loss = self.calc_loss(batch)
             # to reuse loss for critic
             loss.backward = partial(loss.backward, retain_graph=True)
             self.net.training_step(loss=loss)
             # critic.optim.step using the same loss
             loss.backward = partial(loss.backward, retain_graph=False)
             self.critic.training_step(loss=loss)
             total_loss += loss
         loss = total_loss.mean()
         net_util.copy(self.net, self.old_net)
         net_util.copy(self.critic, self.old_critic)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Code Example #4
File: ppo.py Project: ronald-xie/SLM-Lab
 def train_shared(self):
     '''
     Trains the network when the actor and critic share parameters
     '''
     if self.to_train == 1:
         batch = self.sample()
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             with torch.no_grad():
                 advs, v_targets = self.calc_advs_v_targets(batch)
             policy_loss = self.calc_policy_loss(batch, advs)  # from actor
             val_loss = self.calc_val_loss(batch, v_targets)  # from critic
             loss = policy_loss + val_loss
             # retain for entropies etc.
             self.net.training_step(loss=loss, retain_graph=True)
             total_loss += loss.cpu()
         loss = total_loss / self.training_epoch
         net_util.copy(self.net, self.old_net)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Code Example #5
 def train_step(self, loss, optim, lr_scheduler=None, clock=None, global_net=None):
     if lr_scheduler is not None:
         lr_scheduler.step(epoch=ps.get(clock, 'frame'))
     optim.zero_grad()
     loss.backward()
     if self.previous_grads is not None:
         total_norm = 0.0
         for p in self.parameters():
             param_norm = p.grad.data.norm(2)
             total_norm += param_norm.item() ** 2
         total_norm = total_norm ** (1. / 2)
         self.previous_grads.append(total_norm)
         average_grad = sum(self.previous_grads) / len(self.previous_grads)
         if self.clip_grad_val is not None:
             nn.utils.clip_grad_norm_(self.parameters(), min(average_grad * self.grad_scaler, self.clip_grad_val))
         else:
             nn.utils.clip_grad_norm_(self.parameters(), average_grad * self.grad_scaler)
     elif self.clip_grad_val is not None:
         nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
     if global_net is not None:
         net_util.push_global_grads(self, global_net)
     optim.step()
     if global_net is not None:
         net_util.copy(global_net, self)
     if clock is not None:
         clock.tick('opt_step')
     return loss
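Example #5 clips gradients against a running average of recent gradient norms (self.previous_grads, self.grad_scaler). Those attributes are set up elsewhere; a hypothetical initialization matching how they are used above (a bounded window of norms plus a scaling factor) might look like this, where the window size and scaler value are assumptions and not taken from the source:

from collections import deque

def init_grad_tracking(net, window_size=100, grad_scaler=1.5):
    # Hypothetical setup for the adaptive clipping in train_step above.
    # An empty deque is safe because train_step appends the current norm
    # before averaging.
    net.previous_grads = deque(maxlen=window_size)
    net.grad_scaler = grad_scaler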
Code Example #6
File: ppo.py Project: tttor/SLM-Lab
 def train_shared(self):
     '''
     Trains the network when the actor and critic share parameters
     '''
     if self.to_train == 1:
         batch = self.sample()
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             with torch.no_grad():
                 advs, v_targets = self.calc_advs_v_targets(batch)
             policy_loss = self.calc_policy_loss(batch, advs)  # from actor
             val_loss = self.calc_val_loss(batch, v_targets)  # from critic
             loss = policy_loss + val_loss
             # retain for entropies etc.
             self.net.training_step(loss=loss, retain_graph=True)
             total_loss += loss.cpu()
         loss = total_loss / self.training_epoch
         net_util.copy(self.net, self.old_net)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Code Example #7
File: sac.py Project: jimfleming/SLM-Lab
 def update_nets(self):
     '''Update target networks'''
     if util.frame_mod(self.body.env.clock.frame, self.q1_net.update_frequency, self.body.env.num_envs):
         if self.q1_net.update_type == 'replace':
             net_util.copy(self.q1_net, self.target_q1_net)
             net_util.copy(self.q2_net, self.target_q2_net)
         elif self.q1_net.update_type == 'polyak':
             net_util.polyak_update(self.q1_net, self.target_q1_net, self.q1_net.polyak_coef)
             net_util.polyak_update(self.q2_net, self.target_q2_net, self.q2_net.polyak_coef)
         else:
             raise ValueError('Unknown q1_net.update_type. Should be "replace" or "polyak". Exiting.')
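Example #7 selects between a hard replace (net_util.copy) and a soft Polyak update for the twin target Q-networks. A sketch of the soft update, assuming the usual convention target <- coef * source + (1 - coef) * target (SLM-Lab's own coefficient convention may differ), is:

import torch

def polyak_update(src_net, tar_net, polyak_coef=0.005):
    # Soft target-network update: blend source weights into the target in place.
    with torch.no_grad():
        for src_param, tar_param in zip(src_net.parameters(), tar_net.parameters()):
            tar_param.data.mul_(1.0 - polyak_coef).add_(polyak_coef * src_param.data)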
Code Example #8
File: base.py Project: ssfve/SLM-Lab
 def train_step(self, loss, optim, lr_scheduler, clock, global_net=None):
     lr_scheduler.step(epoch=ps.get(clock, 'frame'))
     optim.zero_grad()
     loss.backward()
     if self.clip_grad_val is not None:
         nn.utils.clip_grad_norm_(self.parameters(), self.clip_grad_val)
     if global_net is not None:
         net_util.push_global_grads(self, global_net)
     optim.step()
     if global_net is not None:
         net_util.copy(global_net, self)
     clock.tick('opt_step')
     return loss
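The base train_step in Example #8 (and the variant in Example #5) pushes local gradients to a shared global network before the optimizer step and then copies the global weights back, which is the Hogwild-style pattern used for asynchronous training. net_util.push_global_grads is not shown here; a sketch of that pattern, under the assumption that local and global parameters zip in order, is:

def push_global_grads(net, global_net):
    # Copy local gradients onto the shared global network so that stepping an
    # optimizer bound to the global parameters applies this worker's update.
    # A sketch of the Hogwild-style pattern, not SLM-Lab's exact implementation.
    for param, global_param in zip(net.parameters(), global_net.parameters()):
        if param.grad is not None:
            global_param._grad = param.grad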
Code Example #9
File: dqn.py Project: vmuthuk2/SLM-Lab
 def update_nets(self):
     total_t = self.body.env.clock.total_t
     if total_t % self.net.update_frequency == 0:
         if self.net.update_type == 'replace':
             logger.debug('Updating target_net by replacing')
             net_util.copy(self.net, self.target_net)
         elif self.net.update_type == 'polyak':
             logger.debug('Updating net by averaging')
             net_util.polyak_update(self.net, self.target_net,
                                    self.net.polyak_coef)
         else:
             raise ValueError(
                 'Unknown net.update_type. Should be "replace" or "polyak". Exiting.'
             )
Code Example #10
 def train(self):
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         net_util.copy(self.net, self.old_net)  # update old net
         batch = self.sample()
         clock.set_batch_size(len(batch))
         with torch.no_grad():
             states = batch['states']
             if self.body.env.is_venv:
                 states = math_util.venv_unpack(states)
             # NOTE states is massive with batch_size = time_horizon * num_envs. Chunk up so forward pass can fit into device esp. GPU
             num_chunks = int(len(states) / self.minibatch_size)
             v_preds_chunks = [self.calc_v(states_chunk, use_cache=False) for states_chunk in torch.chunk(states, num_chunks)]
             v_preds = torch.cat(v_preds_chunks)
             advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
         # piggy back on batch, but remember to not pack or unpack
         batch['advs'], batch['v_targets'] = advs, v_targets
         if self.body.env.is_venv:  # unpack if venv for minibatch sampling
             for k, v in batch.items():
                 if k not in ('advs', 'v_targets'):
                     batch[k] = math_util.venv_unpack(v)
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             minibatches = util.split_minibatch(batch, self.minibatch_size)
             for minibatch in minibatches:
                 if self.body.env.is_venv:  # re-pack to restore proper shape
                     for k, v in minibatch.items():
                         if k not in ('advs', 'v_targets'):
                             minibatch[k] = math_util.venv_pack(v, self.body.env.num_envs)
                 advs, v_targets = minibatch['advs'], minibatch['v_targets']
                 pdparams, v_preds = self.calc_pdparam_v(minibatch)
                 policy_loss = self.calc_policy_loss(minibatch, pdparams, advs)  # from actor
                 val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
                 if self.shared:  # shared network
                     loss = policy_loss + val_loss
                     self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                 else:
                     self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                     self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
                     loss = policy_loss + val_loss
                 total_loss += loss
         loss = total_loss / self.training_epoch / len(minibatches)
         # reset
         self.to_train = 0
         logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
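Example #10 flattens vector-environment batches with math_util.venv_unpack before the forward pass and restores the (time, num_envs) layout with math_util.venv_pack for per-minibatch repacking. A sketch of that reshaping, an assumption about the helpers that is consistent with how they are called here, is:

import torch

def venv_unpack(batch_tensor):
    # (time, num_envs, *shape) -> (time * num_envs, *shape)
    t, num_envs = batch_tensor.shape[:2]
    return batch_tensor.reshape(t * num_envs, *batch_tensor.shape[2:])

def venv_pack(batch_tensor, num_envs):
    # (time * num_envs, *shape) -> (time, num_envs, *shape)
    return batch_tensor.reshape(-1, num_envs, *batch_tensor.shape[1:])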
Code Example #11
File: sac.py Project: jianghongping/SLM-Lab
    def init_nets(self, global_nets=None):
        '''
        Networks: net(actor/policy), q1_net, target_q1_net, q2_net, target_q2_net
        All networks are separate, and have the same hidden layer architectures and optim specs, so tuning is minimal
        '''
        self.shared = False  # SAC does not share networks
        NetClass = getattr(net, self.net_spec['type'])
        # main actor network
        self.net = NetClass(self.net_spec, self.body.state_dim,
                            net_util.get_out_dim(self.body))
        self.net_names = ['net']
        # two critic Q-networks to mitigate positive bias in q_loss and speed up training, uses q_net.py with prefix Q
        QNetClass = getattr(net, 'Q' + self.net_spec['type'])
        q_in_dim = [self.body.state_dim, self.body.action_dim]
        self.q1_net = QNetClass(self.net_spec, q_in_dim, 1)
        self.target_q1_net = QNetClass(self.net_spec, q_in_dim, 1)
        self.q2_net = QNetClass(self.net_spec, q_in_dim, 1)
        self.target_q2_net = QNetClass(self.net_spec, q_in_dim, 1)
        self.net_names += [
            'q1_net', 'target_q1_net', 'q2_net', 'target_q2_net'
        ]
        net_util.copy(self.q1_net, self.target_q1_net)
        net_util.copy(self.q2_net, self.target_q2_net)
        # temperature variable to be learned, and its target entropy
        self.log_alpha = torch.zeros(1,
                                     requires_grad=True,
                                     device=self.net.device)
        self.alpha = self.log_alpha.detach().exp()
        if self.body.is_discrete:
            self.target_entropy = -self.body.action_space.n
        else:
            self.target_entropy = -np.product(self.body.action_space.shape)

        # init net optimizer and its lr scheduler
        self.optim = net_util.get_optim(self.net, self.net.optim_spec)
        self.lr_scheduler = net_util.get_lr_scheduler(
            self.optim, self.net.lr_scheduler_spec)
        self.q1_optim = net_util.get_optim(self.q1_net, self.q1_net.optim_spec)
        self.q1_lr_scheduler = net_util.get_lr_scheduler(
            self.q1_optim, self.q1_net.lr_scheduler_spec)
        self.q2_optim = net_util.get_optim(self.q2_net, self.q2_net.optim_spec)
        self.q2_lr_scheduler = net_util.get_lr_scheduler(
            self.q2_optim, self.q2_net.lr_scheduler_spec)
        self.alpha_optim = net_util.get_optim(self.log_alpha,
                                              self.net.optim_spec)
        self.alpha_lr_scheduler = net_util.get_lr_scheduler(
            self.alpha_optim, self.net.lr_scheduler_spec)
        net_util.set_global_nets(self, global_nets)
        self.post_init_nets()
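Example #11 only initializes the learned temperature (log_alpha, alpha_optim) and its target entropy. In standard SAC the temperature is then trained to keep the policy's entropy near target_entropy; a sketch of that update, reusing the attributes defined above but not taken verbatim from SLM-Lab, is:

def calc_alpha_loss(self, log_probs):
    # Standard SAC temperature objective: raise alpha when the policy's entropy
    # (-log_probs) falls below target_entropy, lower it otherwise.
    return -(self.log_alpha * (log_probs.detach() + self.target_entropy)).mean()

def train_alpha(self, alpha_loss):
    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()
    self.alpha = self.log_alpha.detach().exp()  # refresh the cached temperature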
Code Example #12
 def train_separate(self):
     '''
     Trains the networks when the actor and critic do not share parameters
     '''
     if self.to_train == 1:
         net_util.copy(self.net, self.old_net)
         batch = self.sample()
         policy_loss = self.train_actor(batch)
         val_loss = self.train_critic(batch)
         loss = val_loss + abs(policy_loss)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.4f}')
         self.last_loss = loss.item()
     return self.last_loss
Code Example #13
File: ppo.py Project: ronald-xie/SLM-Lab
 def train_separate(self):
     '''
     Trains the networks when the actor and critic do not share parameters
     '''
     if self.to_train == 1:
         batch = self.sample()
         policy_loss = self.train_actor(batch)
         val_loss = self.train_critic(batch)
         loss = val_loss + abs(policy_loss)
         net_util.copy(self.net, self.old_net)
         net_util.copy(self.critic, self.old_critic)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Code Example #14
File: ppo.py Project: xiangshengcn/SLM-Lab
    def train_separate(self):
        '''
        Trains the networks when the actor and critic do not share parameters
        '''
        if self.to_train == 1:
            net_util.copy(self.net, self.old_net)
            batch = self.sample()
            policy_loss = self.train_actor(batch)
            val_loss = self.train_critic(batch)
            loss = val_loss + abs(policy_loss)
            # reset
            self.to_train = 0
            self.body.entropies = []
            self.body.log_probs = []
            logger.debug(f'Trained {self.name} at epi: {self.body.env.clock.get("epi")}, total_t: {self.body.env.clock.get("total_t")}, t: {self.body.env.clock.get("t")}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:.8f}')

            return loss.item()
        else:
            return np.nan
Code Example #15
File: ppo.py Project: vhcg77/SLM-Lab
 def train_shared(self):
     '''
     Trains the network when the actor and critic share parameters
     '''
     if self.to_train == 1:
         batch = self.sample()
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             loss = self.calc_loss(batch)
             self.net.training_step(loss=loss)
             total_loss += loss
         loss = total_loss.mean()
         net_util.copy(self.net, self.old_net)
         # reset
         self.to_train = 0
         self.body.log_probs = []
         self.body.entropies = []
         logger.debug(f'Loss: {loss:.2f}')
         self.last_loss = loss.item()
     return self.last_loss
Code Example #16
 def train_separate(self):
     '''
     Trains the networks when the actor and critic do not share parameters
     '''
     clock = self.body.env.clock
     if self.to_train == 1:
         torch.cuda.empty_cache()
         net_util.copy(self.net, self.old_net)
         batch = self.sample()
         policy_loss = self.train_actor(batch)
         val_loss = self.train_critic(batch)
         loss = val_loss + policy_loss
         # reset
         self.to_train = 0
         self.body.flush()
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan