Code Example #1
 def train(self):
     '''
     Completes one training step for the agent if it is time to train,
     i.e. the environment timestep exceeds the minimum training timestep and is a multiple of training_frequency.
     Each training step samples n batches from the agent's memory;
     for each batch, the target Q values (q_targets) are computed and a training step is taken k times.
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_iter):
             batch = self.sample()
             clock.set_batch_size(len(batch))
             for _ in range(self.training_batch_iter):
                 loss = self.calc_q_loss(batch)
                 self.net.train_step(loss,
                                     self.optim,
                                     self.lr_scheduler,
                                     clock=clock,
                                     global_net=self.global_net)
                 total_loss += loss
         loss = total_loss / (self.training_iter * self.training_batch_iter)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
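The docstring above describes the training trigger, but the code that sets self.to_train is not part of this excerpt. As a rough, hypothetical sketch of such a schedule check (the names training_start_step and training_frequency are taken from the docstring's wording and are assumptions, not confirmed by this excerpt):

def is_time_to_train(frame, training_start_step, training_frequency):
    '''Hypothetical helper: return True once the frame counter has passed the
    minimum training timestep and lands on a multiple of training_frequency,
    mirroring the condition described in the docstring above.'''
    if frame < training_start_step:
        return False
    return frame % training_frequency == 0

# e.g. with training_start_step=32 and training_frequency=4,
# frames 32, 36, 40, ... would set self.to_train = 1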
Code Example #2
File: dqn.py  Project: kiseliu/NeuralPipeline_DSTC8
 def train(self):
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_iter):
             batches = []
             if self.body.warmup_memory.size >= self.body.warmup_memory.batch_size:
                 batches.append(self.warmup_sample())
             if self.body.memory.size >= self.body.memory.batch_size:
                 batches.append(self.sample())
             clock.set_batch_size(sum(len(batch) for batch in batches))
             for batch in batches:
                 for _ in range(self.training_batch_iter):
                     loss = self.calc_q_loss(batch)
                     self.net.train_step(loss,
                                         self.optim,
                                         self.lr_scheduler,
                                         clock=clock,
                                         global_net=self.global_net)
                     total_loss += loss
         loss = total_loss / (self.training_iter * self.training_batch_iter)
         # reset
         self.to_train = 0
         logger.info(
             f'Trained {self.name} at epi: {clock.epi}, warmup_size: {self.body.warmup_memory.size}, memory_size: {self.body.memory.size}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Code Example #3
    def update(self, obs, action, reward, next_obs, done):
        '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
        # update state
        input_act, next_state, encoded_state = self.state_update(
            next_obs, action)

        # update body
        self.body.update(self.body.state, action, reward, next_state, done)

        # update memory
        if util.in_eval_lab_modes() or self.algorithm.__class__.__name__ == 'ExternalPolicy':  # eval does not update agent for training
            self.body.state, self.body.encoded_state = next_state, encoded_state
            return

        if not hasattr(self.body, 'warmup_memory') or self.body.env.clock.epi > self.warmup_epi:
            self.body.memory.update(self.body.encoded_state, self.body.action,
                                    reward, encoded_state, done)
        else:
            self.body.warmup_memory.update(self.body.encoded_state,
                                           self.body.action, reward,
                                           encoded_state, done)

        # update body
        self.body.state, self.body.encoded_state = next_state, encoded_state

        # train algorithm
        loss = self.algorithm.train()
        if not np.isnan(loss):  # set for log_summary()
            self.body.loss = loss
        explore_var = self.algorithm.update()

        return loss, explore_var
Code Example #4
 def update(self, algorithm, clock):
     '''Get an updated value for var'''
     if (util.in_eval_lab_modes()) or self._updater_name == 'no_decay':
         return self.end_val
     step = clock.get()
     val = self._updater(self.start_val, self.end_val, self.start_step, self.end_step, step)
     return val
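Example #4 delegates the actual schedule to self._updater, which is not shown here. A linear decay with the same call signature is one common choice; the sketch below is illustrative only and is not the library's implementation.

def linear_decay(start_val, end_val, start_step, end_step, step):
    '''Linearly anneal from start_val to end_val between start_step and end_step,
    clamping outside that range. The signature mirrors the _updater call above (assumed).'''
    if step < start_step:
        return start_val
    if step >= end_step:
        return end_val
    frac = (step - start_step) / (end_step - start_step)
    return start_val + frac * (end_val - start_val)

# e.g. linear_decay(1.0, 0.1, 0, 10000, 5000) == 0.55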
Code Example #5
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         clock.set_batch_size(len(batch))
         loss = self.calc_q_loss(batch)
         self.net.train_step(loss,
                             self.optim,
                             self.lr_scheduler,
                             clock=clock,
                             global_net=self.global_net)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Code Example #6
 def save(self, ckpt=None):
     '''Save agent'''
     if self.algorithm.__class__.__name__ == 'ExternalPolicy':
         return
     if util.in_eval_lab_modes():
         # eval does not save new models
         return
     self.algorithm.save(ckpt=ckpt)
Code Example #7
def random(state, algorithm, body):
    '''Random action using gym.action_space.sample(), with the same format as default()'''
    if body.env.is_venv and not util.in_eval_lab_modes():
        _action = [body.action_space.sample() for _ in range(body.env.num_envs)]
    else:
        _action = body.action_space.sample()
    action = torch.tensor([_action])
    return action
Code Example #8
 def act(self, state):
     '''Random action'''
     body = self.body
     if body.env.is_venv and not util.in_eval_lab_modes():
         action = np.array(
             [body.action_space.sample() for _ in range(body.env.num_envs)])
     else:
         action = body.action_space.sample()
     return action
Code Example #9
def guard_tensor(state, body):
    '''Guard-cast tensor before being input to network'''
    # if isinstance(state, LazyFrames):
    #     state = state.__array__()  # realize data
    state = torch.from_numpy(state.astype(np.float32))
    if not body.env.is_venv or util.in_eval_lab_modes():
        # singleton state, unsqueeze as minibatch for net input
        state = state.unsqueeze(dim=0)
    return state
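To make the shape handling in guard_tensor concrete, here is a small standalone check that replicates (rather than imports) its singleton-state branch; the (4,) observation shape is only an example.

import numpy as np
import torch

state = np.zeros(4, dtype=np.float64)           # e.g. a single 4-dim observation
x = torch.from_numpy(state.astype(np.float32))  # cast, as in guard_tensor
x = x.unsqueeze(dim=0)                          # singleton state -> minibatch of size 1
assert x.shape == (1, 4)                        # ready as net input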
Code Example #10
 def run(self):
     if util.in_eval_lab_modes():
         self.run_eval()
         metrics = None
     else:
         self.run_rl()
         metrics = analysis.analyze_session(self.spec, self.agent.body.eval_df, 'eval')
     self.close()
     return metrics
Code Example #11
 def update(self, state, action, reward, next_state, done):
     '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
     self.body.update(state, action, reward, next_state, done)
     if util.in_eval_lab_modes():  # eval does not update agent for training
         return
     self.body.memory.update(state, action, reward, next_state, done)
     loss = self.algorithm.train()
     if not np.isnan(loss):  # set for log_summary()
         self.body.loss = loss
     explore_var = self.algorithm.update()
     return loss, explore_var
Code Example #12
File: base.py  Project: sherlock1987/Dp-without-Adv
 def post_init_nets(self):
     '''
     Method to conditionally load models.
     Call at the end of init_nets() after setting self.net_names
     '''
     assert hasattr(self, 'net_names')
     if util.in_eval_lab_modes():
         logger.info(f'Loaded algorithm models for lab_mode: {util.get_lab_mode()}')
         self.load()
     else:
         logger.info(f'Initialized algorithm models for lab_mode: {util.get_lab_mode()}')
Code Example #13
 def space_train(self):
     if util.in_eval_lab_modes():
         return np.nan
     losses = []
     for body in self.agent.nanflat_body_a:
         self.body = body
         losses.append(self.train())
     # set body reference back to default
     self.body = self.agent.nanflat_body_a[0]
     loss_a = self.nanflat_to_data_a('loss', losses)
     return loss_a
Code Example #14
def warmup_default(state, algorithm, body):
    action = default(state, algorithm, body)

    if util.in_eval_lab_modes():
        return action

    if body.env.clock.epi < algorithm.warmup_epi:
        if hasattr(body, 'state'):
            action = rule_guide(body.state, algorithm, body)
        else:
            action = rule_guide(state, algorithm, body)
    return action
Code Example #15
 def to_ckpt(self, env, mode='eval'):
     '''Check with the clock whether to run a log/eval checkpoint: every log/eval frequency frames and at the end of training (never at init)'''
     if mode == 'eval' and util.in_eval_lab_modes():  # avoid double-eval: eval-ckpt in eval mode
         return False
     clock = env.clock
     frame = clock.get()
     frequency = env.eval_frequency if mode == 'eval' else env.log_frequency
     if frame == 0 or clock.get('opt_step') == 0:  # avoid ckpt at init
         to_ckpt = False
     elif frequency is None:  # default episodic
         to_ckpt = env.done
     else:  # normal ckpt condition by mod remainder (general for venv)
         to_ckpt = util.frame_mod(frame, frequency, env.num_envs) or frame == clock.max_frame
     return to_ckpt
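util.frame_mod is not shown in this excerpt. Given the comment about venvs, a plausible reading is a modulus test widened by num_envs, so a checkpoint still fires when the frame counter advances in steps of num_envs and jumps over the exact multiple; the sketch below is an assumption, not the library's code.

def frame_mod_sketch(frame, frequency, num_envs):
    '''Assumed behavior: True if a multiple of frequency falls inside the last
    clock tick, whose width is num_envs frames (or 1 for a single env).'''
    tick = num_envs or 1
    return frame % frequency < tick

# single env: fires exactly on multiples of frequency
assert frame_mod_sketch(1000, 500, None) is True
# venv with 4 envs, frame jumps 4 at a time: 1002 still catches the 1000 mark
assert frame_mod_sketch(1002, 500, 4) is True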
Code Example #16
def warmup_epsilon_greedy(state, algorithm, body):
    action = default(state, algorithm, body)

    if util.in_eval_lab_modes():
        return action

    epsilon = body.explore_var
    if epsilon > np.random.rand():
        action = random(state, algorithm, body)
    if body.env.clock.epi < algorithm.warmup_epi:
        if hasattr(body, 'state'):
            action = rule_guide(body.state, algorithm, body)
        else:
            action = rule_guide(state, algorithm, body)
    return action
Code Example #17
    def train(self):
        '''Train actor critic by computing the loss in batch efficiently'''
        if util.in_eval_lab_modes():
            return np.nan
        clock = self.body.env.clock
        if self.to_train == 1:
            batch = self.sample()
            """
            Add rewards over here.
            """
            batch = self.replace_reward_batch(batch)
            clock.set_batch_size(len(batch))
            pdparams, v_preds = self.calc_pdparam_v(batch)

            # get loss of critic: advs and targets of critic v_targets.
            advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
            policy_loss = self.calc_policy_loss(batch, pdparams,
                                                advs)  # from actor
            val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
            if self.shared:  # shared network
                loss = policy_loss + val_loss
                self.net.train_step(loss,
                                    self.optim,
                                    self.lr_scheduler,
                                    clock=clock,
                                    global_net=self.global_net)
            else:
                # networks are not shared: step actor and critic with their own optimizers
                self.net.train_step(policy_loss,
                                    self.optim,
                                    self.lr_scheduler,
                                    clock=clock,
                                    global_net=self.global_net)
                self.critic_net.train_step(val_loss,
                                           self.critic_optim,
                                           self.critic_lr_scheduler,
                                           clock=clock,
                                           global_net=self.global_critic_net)
                loss = policy_loss + val_loss
            # reset
            self.to_train = 0
            logger.debug(
                f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}'
            )
            return loss.item()
        else:
            return np.nan
Code Example #18
 def train(self):
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         # print("===========")
         self.reward_agent.eval()
         total_loss = torch.tensor(0.0)
         self.reward_count = 0
         self.batch_count = 0
         for _ in range(self.training_iter):
             batches = []
             warmup = False
             if self.body.warmup_memory.size >= self.body.warmup_memory.batch_size:
                 batches.append(self.warmup_sample())
                 warmup = True
             if self.body.memory.size >= self.body.memory.batch_size:
                 batches.append(self.sample())
             clock.set_batch_size(sum(len(batch) for batch in batches))
             for idx, batch in enumerate(batches):
                 for _ in range(self.training_batch_iter):
                     loss = self.calc_q_loss(batch, False)
                     self.net.train_step(loss,
                                         self.optim,
                                         self.lr_scheduler,
                                         clock=clock,
                                         global_net=self.global_net)
                     total_loss += loss
         loss = total_loss / (self.training_iter * self.training_batch_iter)
         reward_irl = self.reward_count / self.batch_count
         logger.info("***********")
         logger.info(reward_irl)
         # reset
         self.to_train = 0
         logger.info(
             f'Trained {self.name} at epi: {clock.epi}, warmup_size: {self.body.warmup_memory.size}, memory_size: {self.body.memory.size}, loss: {loss:g}, irl_reward: {reward_irl}'
         )
         return loss.item()
     else:
         return np.nan
Code Example #19
File: base.py  Project: temporaer/ConvLab
 def __init__(self, spec, e=None):
     self.e = e or 0  # for multi-env
     self.done = False
     self.env_spec = spec['env'][self.e]
     # set default
     util.set_attr(
         self,
         dict(
             log_frequency=None,  # default to log at epi done
             frame_op=None,
             frame_op_len=None,
             normalize_state=False,
             reward_scale=None,
             num_envs=None,
         ))
     util.set_attr(self, spec['meta'], [
         'log_frequency',
         'eval_frequency',
     ])
     util.set_attr(self, self.env_spec, [
         'name',
         'frame_op',
         'frame_op_len',
         'normalize_state',
         'reward_scale',
         'num_envs',
         'max_t',
         'max_frame',
     ])
     seq_len = ps.get(spec, 'agent.0.net.seq_len')
     if seq_len is not None:  # infer if using RNN
         self.frame_op = 'stack'
         self.frame_op_len = seq_len
     if util.in_eval_lab_modes():  # use singleton for eval
         self.num_envs = 1
         self.log_frequency = None
     if spec['meta']['distributed'] != False:  # divide max_frame for distributed
         self.max_frame = int(self.max_frame / spec['meta']['max_session'])
     self.is_venv = (self.num_envs is not None and self.num_envs > 1)
     if self.is_venv:
         assert self.log_frequency is not None, f'Specify log_frequency when using venv'
     self.clock_speed = 1 * (self.num_envs or 1)  # tick with a multiple of num_envs to properly count frames
     self.clock = Clock(self.max_frame, self.clock_speed)
     self.to_render = util.to_render()
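The Clock constructed at the end of Example #19 is not defined in this excerpt. The sketch below only illustrates what clock_speed = num_envs implies, namely that each vectorized env step advances the frame count by num_envs; the real class tracks more units (opt_step among them) and may differ.

class ClockSketch:
    '''Illustrative stand-in, not the library's Clock.'''

    def __init__(self, max_frame, clock_speed=1):
        self.max_frame = max_frame
        self.clock_speed = clock_speed  # = num_envs for a venv
        self.frame = 0  # total env frames observed
        self.epi = 0    # episodes completed
        self.t = 0      # timestep within the current episode

    def tick(self, unit='t'):
        if unit == 't':      # one step of the (possibly vectorized) env
            self.t += 1
            self.frame += self.clock_speed
        elif unit == 'epi':  # episode boundary
            self.epi += 1
            self.t = 0

    def get(self, unit='frame'):
        return getattr(self, unit)

# with 4 envs, 10 vectorized steps count as 40 frames
clock = ClockSketch(max_frame=10000, clock_speed=4)
for _ in range(10):
    clock.tick('t')
assert clock.get() == 40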
Code Example #20
 def train(self):
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         clock.set_batch_size(len(batch))
         pdparams = self.calc_pdparam_batch(batch)
         advs = self.calc_ret_advs(batch)
         loss = self.calc_policy_loss(batch, pdparams, advs)
         self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
         # reset
         self.to_train = 0
         logger.info(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
Code Example #21
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    if util.in_eval_lab_modes():
        # load specific model in eval mode
        model_prepath = agent.spec['meta']['eval_model_prepath']
    else:
        model_prepath = agent.spec['meta']['model_prepath']
    logger.info(f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names} from {model_prepath}_*.pt')
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{model_prepath}_{net_name}_model.pt'
        load(net, model_path)
        optim_name = net_name.replace('net', 'optim')
        optim = getattr(algorithm, optim_name, None)
        if optim is not None:  # only trainable net has optim
            optim_path = f'{model_prepath}_{net_name}_optim.pt'
            load(optim, optim_path)
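Example #21 loads nets from {model_prepath}_{net_name}_model.pt and the matching _optim.pt files. The complementary save path is not shown; the sketch below mirrors that naming scheme with plain torch.save calls and is a guess at the shape of such a helper, not the library's implementation.

import torch

def save_algorithm_sketch(algorithm):
    '''Hedged mirror of load_algorithm above: persist each named net (and its
    optimizer, if present) under the same file-naming scheme.'''
    model_prepath = algorithm.agent.spec['meta']['model_prepath']
    for net_name in algorithm.net_names:
        net = getattr(algorithm, net_name)
        torch.save(net.state_dict(), f'{model_prepath}_{net_name}_model.pt')
        optim = getattr(algorithm, net_name.replace('net', 'optim'), None)
        if optim is not None:  # only trainable nets carry an optimizer
            torch.save(optim.state_dict(), f'{model_prepath}_{net_name}_optim.pt')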
Code Example #22
 def save(self, ckpt=None):
     '''Save agent'''
     if util.in_eval_lab_modes():  # eval does not save new models
         return
     self.algorithm.save(ckpt=ckpt)
Code Example #23
 def train(self):
     '''Implement algorithm train, or raise NotImplementedError'''
     if util.in_eval_lab_modes():
         return np.nan
     raise NotImplementedError
Code Example #24
    def train(self):
        if util.in_eval_lab_modes():
            return np.nan
        clock = self.body.env.clock
        if self.body.env.clock.epi > 700:
            self.pretrain_finished = True
        if self.to_train == 1:
            net_util.copy(self.net, self.old_net)  # update old net
            batch = self.sample()
            if self.reward_type == 'OFFGAN':
                batch = self.replace_reward_batch(batch)
            # other reward types (DISC, AIRL, OFFGAN_update) would substitute their own rewards here

            clock.set_batch_size(len(batch))
            _pdparams, v_preds = self.calc_pdparam_v(batch)
            advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
            # piggy back on batch, but remember to not pack or unpack
            batch['advs'], batch['v_targets'] = advs, v_targets
            if self.body.env.is_venv:  # unpack if venv for minibatch sampling
                for k, v in batch.items():
                    if k not in ('advs', 'v_targets'):
                        batch[k] = math_util.venv_unpack(v)
            total_loss = torch.tensor(0.0)
            for _ in range(self.training_epoch):
                minibatches = util.split_minibatch(batch, self.minibatch_size)

                for minibatch in minibatches:
                    if self.body.env.is_venv:  # re-pack to restore proper shape
                        for k, v in minibatch.items():
                            if k not in ('advs', 'v_targets'):
                                minibatch[k] = math_util.venv_pack(
                                    v, self.body.env.num_envs)
                    advs, v_targets = minibatch['advs'], minibatch['v_targets']
                    pdparams, v_preds = self.calc_pdparam_v(minibatch)
                    policy_loss = self.calc_policy_loss(
                        minibatch, pdparams, advs)  # from actor
                    val_loss = self.calc_val_loss(v_preds,
                                                  v_targets)  # from critic
                    if self.shared:  # shared network
                        loss = policy_loss + val_loss
                        self.net.train_step(loss,
                                            self.optim,
                                            self.lr_scheduler,
                                            clock=clock,
                                            global_net=self.global_net)
                    else:
                        # pretrain_finished = false -> policy keep fixed, updating value net and disc
                        if not self.pretrain_finished:
                            self.critic_net.train_step(
                                val_loss,
                                self.critic_optim,
                                self.critic_lr_scheduler,
                                clock=clock,
                                global_net=self.global_critic_net)
                            loss = val_loss
                        if self.pretrain_finished and self.policy_training_flag:
                            self.net.train_step(policy_loss,
                                                self.optim,
                                                self.lr_scheduler,
                                                clock=clock,
                                                global_net=self.global_net)
                            self.critic_net.train_step(
                                val_loss,
                                self.critic_optim,
                                self.critic_lr_scheduler,
                                clock=clock,
                                global_net=self.global_critic_net)
                            loss = policy_loss + val_loss

                    total_loss += loss
            loss = total_loss / self.training_epoch / len(minibatches)
            if not self.pretrain_finished:
                logger.info(
                    "warmup Value net, epi: {}, frame: {}, loss: {}".format(
                        clock.epi, clock.frame, loss))
            # reset
            self.to_train = 0
            self.policy_training_flag = False
            logger.debug(
                f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}'
            )
            return loss.item()
        else:
            return np.nan
Code Example #25
 def get_env(self):
     return self.body.eval_env if util.in_eval_lab_modes() else self.body.env
Code Example #26
 def train(self):
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         net_util.copy(self.net, self.old_net)  # update old net
         batch = self.sample()
         clock.set_batch_size(len(batch))
         _pdparams, v_preds = self.calc_pdparam_v(batch)
         advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
         # piggy back on batch, but remember to not pack or unpack
         batch['advs'], batch['v_targets'] = advs, v_targets
         if self.body.env.is_venv:  # unpack if venv for minibatch sampling
             for k, v in batch.items():
                 if k not in ('advs', 'v_targets'):
                     batch[k] = math_util.venv_unpack(v)
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             minibatches = util.split_minibatch(batch, self.minibatch_size)
             for minibatch in minibatches:
                 if self.body.env.is_venv:  # re-pack to restore proper shape
                     for k, v in minibatch.items():
                         if k not in ('advs', 'v_targets'):
                             minibatch[k] = math_util.venv_pack(
                                 v, self.body.env.num_envs)
                 advs, v_targets = minibatch['advs'], minibatch['v_targets']
                 pdparams, v_preds = self.calc_pdparam_v(minibatch)
                 policy_loss = self.calc_policy_loss(
                     minibatch, pdparams, advs)  # from actor
                 val_loss = self.calc_val_loss(v_preds,
                                               v_targets)  # from critic
                 if self.shared:  # shared network
                     loss = policy_loss + val_loss
                     self.net.train_step(loss,
                                         self.optim,
                                         self.lr_scheduler,
                                         clock=clock,
                                         global_net=self.global_net)
                 else:
                     self.net.train_step(policy_loss,
                                         self.optim,
                                         self.lr_scheduler,
                                         clock=clock,
                                         global_net=self.global_net)
                     self.critic_net.train_step(
                         val_loss,
                         self.critic_optim,
                         self.critic_lr_scheduler,
                         clock=clock,
                         global_net=self.global_critic_net)
                     loss = policy_loss + val_loss
                 total_loss += loss
         loss = total_loss / self.training_epoch / len(minibatches)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
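Examples #24 and #26 unpack venv tensors before minibatch splitting and re-pack them afterwards. The math_util helpers are not included in these excerpts; the reshapes below show the layout change they imply, assuming batch tensors laid out as (timesteps, num_envs, ...), and may differ from the actual implementation.

import torch

def venv_unpack_sketch(x):
    '''(T, num_envs, ...) -> (T * num_envs, ...) so minibatches can mix envs.'''
    return x.reshape(-1, *x.shape[2:])

def venv_pack_sketch(x, num_envs):
    '''(T * num_envs, ...) -> (T, num_envs, ...), restoring the per-env layout.'''
    return x.reshape(-1, num_envs, *x.shape[1:])

x = torch.arange(24.0).reshape(3, 4, 2)  # T=3 steps, num_envs=4, feature dim 2
flat = venv_unpack_sketch(x)             # shape (12, 2)
assert torch.equal(venv_pack_sketch(flat, num_envs=4), x)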